Bullet Collision Detection & Physics Library
btVector3.cpp
1/*
2 Copyright (c) 2011 Apple Inc.
3 https://bulletphysics.org
4
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose,
8 including commercial applications, and to alter it and redistribute it freely,
9 subject to the following restrictions:
10
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14
15 This source version has been altered.
16 */
17
18#if defined(_WIN32) || defined(__i386__)
19#define BT_USE_SSE_IN_API
20#endif
21
22#include "btVector3.h"
23
24#if defined BT_USE_SIMD_VECTOR3
25
26#if DEBUG
27#include <string.h> //for memset
28#endif
29
30#ifdef __APPLE__
31#include <stdint.h>
32typedef float float4 __attribute__((vector_size(16)));
33#else
34#define float4 __m128
35#endif
36//typedef uint32_t uint4 __attribute__ ((vector_size(16)));
37
38#if defined BT_USE_SSE || defined _WIN32
39
40#define LOG2_ARRAY_SIZE 6
41#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
42
43#include <emmintrin.h>
44
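// _maxdot_large scans `count` vertices (each stored as four floats: x, y, z plus padding) and returns the
// index of the vertex whose dot product with the xyz of `vec` is largest; that dot product is written to
// *dotResult. Vertices are consumed four at a time, with SSE shuffles transposing x/y/z into separate
// registers, and each tile's results are buffered in stack_array so the position of the maximum only has
// to be located when a tile actually beats the running maximum.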
45long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
46long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
47{
48 const float4 *vertices = (const float4 *)vv;
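// indexTable maps a non-zero 4-bit _mm_movemask_ps result to the lane index of its lowest set bit;
// entry 0 is never used because the scans below only stop once a comparison mask is non-zero.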
49 static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
50 float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY);
51 float4 vvec = _mm_loadu_ps(vec);
52 float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));
53 float4 vLo = _mm_movelh_ps(vvec, vvec);
54
55 long maxIndex = -1L;
56
57 size_t segment = 0;
58 float4 stack_array[STACK_ARRAY_COUNT];
59
60#if DEBUG
61 //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
62#endif
63
64 size_t index;
65 float4 max;
66 // Faster loop without cleanup code for full tiles
67 for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
68 {
69 max = dotMax;
70
71 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
72 { // do four dot products at a time. Carefully avoid touching the w element.
73 float4 v0 = vertices[0];
74 float4 v1 = vertices[1];
75 float4 v2 = vertices[2];
76 float4 v3 = vertices[3];
77 vertices += 4;
78
79 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
80 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
81 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
82 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
83
84 lo0 = lo0 * vLo;
85 lo1 = lo1 * vLo;
86 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
87 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
88 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
89 z = z * vHi;
90 x = x + y;
91 x = x + z;
92 stack_array[index] = x;
93 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
94
95 v0 = vertices[0];
96 v1 = vertices[1];
97 v2 = vertices[2];
98 v3 = vertices[3];
99 vertices += 4;
100
101 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
102 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
103 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
104 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
105
106 lo0 = lo0 * vLo;
107 lo1 = lo1 * vLo;
108 z = _mm_shuffle_ps(hi0, hi1, 0x88);
109 x = _mm_shuffle_ps(lo0, lo1, 0x88);
110 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
111 z = z * vHi;
112 x = x + y;
113 x = x + z;
114 stack_array[index + 1] = x;
115 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
116
117 v0 = vertices[0];
118 v1 = vertices[1];
119 v2 = vertices[2];
120 v3 = vertices[3];
121 vertices += 4;
122
123 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
124 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
125 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
126 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
127
128 lo0 = lo0 * vLo;
129 lo1 = lo1 * vLo;
130 z = _mm_shuffle_ps(hi0, hi1, 0x88);
131 x = _mm_shuffle_ps(lo0, lo1, 0x88);
132 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
133 z = z * vHi;
134 x = x + y;
135 x = x + z;
136 stack_array[index + 2] = x;
137 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
138
139 v0 = vertices[0];
140 v1 = vertices[1];
141 v2 = vertices[2];
142 v3 = vertices[3];
143 vertices += 4;
144
145 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
146 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
147 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
148 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
149
150 lo0 = lo0 * vLo;
151 lo1 = lo1 * vLo;
152 z = _mm_shuffle_ps(hi0, hi1, 0x88);
153 x = _mm_shuffle_ps(lo0, lo1, 0x88);
154 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
155 z = z * vHi;
156 x = x + y;
157 x = x + z;
158 stack_array[index + 3] = x;
159 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
160
161 // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
162 }
163
164 // If we found a new max
165 if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
166 {
167 // copy the new max across all lanes of our max accumulator
168 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
169 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
170
171 dotMax = max;
172
173 // find first occurrence of that max
174 size_t test;
175 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
176 {
177 }
178 // record where it is.
179 maxIndex = 4 * index + segment + indexTable[test];
180 }
181 }
182
183 // account for work we've already done
184 count -= segment;
185
186 // Deal with the last < STACK_ARRAY_COUNT vectors
187 max = dotMax;
188 index = 0;
189
190 if (btUnlikely(count > 16))
191 {
192 for (; index + 4 <= count / 4; index += 4)
193 { // do four dot products at a time. Carefully avoid touching the w element.
194 float4 v0 = vertices[0];
195 float4 v1 = vertices[1];
196 float4 v2 = vertices[2];
197 float4 v3 = vertices[3];
198 vertices += 4;
199
200 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
201 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
202 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
203 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
204
205 lo0 = lo0 * vLo;
206 lo1 = lo1 * vLo;
207 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
210 z = z * vHi;
211 x = x + y;
212 x = x + z;
213 stack_array[index] = x;
214 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
215
216 v0 = vertices[0];
217 v1 = vertices[1];
218 v2 = vertices[2];
219 v3 = vertices[3];
220 vertices += 4;
221
222 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
223 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
224 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
225 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
226
227 lo0 = lo0 * vLo;
228 lo1 = lo1 * vLo;
229 z = _mm_shuffle_ps(hi0, hi1, 0x88);
230 x = _mm_shuffle_ps(lo0, lo1, 0x88);
231 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
232 z = z * vHi;
233 x = x + y;
234 x = x + z;
235 stack_array[index + 1] = x;
236 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
237
238 v0 = vertices[0];
239 v1 = vertices[1];
240 v2 = vertices[2];
241 v3 = vertices[3];
242 vertices += 4;
243
244 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
245 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
246 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
247 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
248
249 lo0 = lo0 * vLo;
250 lo1 = lo1 * vLo;
251 z = _mm_shuffle_ps(hi0, hi1, 0x88);
252 x = _mm_shuffle_ps(lo0, lo1, 0x88);
253 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
254 z = z * vHi;
255 x = x + y;
256 x = x + z;
257 stack_array[index + 2] = x;
258 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
259
260 v0 = vertices[0];
261 v1 = vertices[1];
262 v2 = vertices[2];
263 v3 = vertices[3];
264 vertices += 4;
265
266 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
267 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
268 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
269 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
270
271 lo0 = lo0 * vLo;
272 lo1 = lo1 * vLo;
273 z = _mm_shuffle_ps(hi0, hi1, 0x88);
274 x = _mm_shuffle_ps(lo0, lo1, 0x88);
275 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
276 z = z * vHi;
277 x = x + y;
278 x = x + z;
279 stack_array[index + 3] = x;
280 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
281
282 // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
283 }
284 }
285
286 size_t localCount = (count & -4L) - 4 * index;
287 if (localCount)
288 {
289#ifdef __APPLE__
290 float4 t0, t1, t2, t3, t4;
291 float4 *sap = &stack_array[index + localCount / 4];
292 vertices += localCount; // counter the offset
293 size_t byteIndex = -(localCount) * sizeof(float);
294 //AT&T Code style assembly
295 asm volatile(
296 ".align 4 \n\
297 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
298 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
299 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
300 movaps %[t0], %[max] // vertices[0] \n\
301 movlhps %[t1], %[max] // x0y0x1y1 \n\
302 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
303 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
304 mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
305 movhlps %[t0], %[t1] // z0w0z1w1 \n\
306 movaps %[t3], %[t0] // vertices[2] \n\
307 movlhps %[t4], %[t0] // x2y2x3y3 \n\
308 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
309 movhlps %[t3], %[t4] // z2w2z3w3 \n\
310 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
311 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
312 movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
313 shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
314 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
315 addps %[t3], %[max] // x + y \n\
316 addps %[t1], %[max] // x + y + z \n\
317 movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
318 maxps %[t2], %[max] // record max, restore max \n\
319 add $16, %[byteIndex] // advance loop counter\n\
320 jnz 0b \n\
321 "
322 : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
323 : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
324 : "memory", "cc");
325 index += localCount / 4;
326#else
327 {
328 for (unsigned int i = 0; i < localCount / 4; i++, index++)
329 { // do four dot products at a time. Carefully avoid touching the w element.
330 float4 v0 = vertices[0];
331 float4 v1 = vertices[1];
332 float4 v2 = vertices[2];
333 float4 v3 = vertices[3];
334 vertices += 4;
335
336 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
337 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
338 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
339 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
340
341 lo0 = lo0 * vLo;
342 lo1 = lo1 * vLo;
343 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
344 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
345 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
346 z = z * vHi;
347 x = x + y;
348 x = x + z;
349 stack_array[index] = x;
350 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
351 }
352 }
353#endif //__APPLE__
354 }
355
356 // process the last few points
357 if (count & 3)
358 {
359 float4 v0, v1, v2, x, y, z;
360 switch (count & 3)
361 {
362 case 3:
363 {
364 v0 = vertices[0];
365 v1 = vertices[1];
366 v2 = vertices[2];
367
368 // Calculate 3 dot products, transpose, duplicate v2
369 float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
370 float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
371 lo0 = lo0 * vLo;
372 z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
373 z = z * vHi;
374 float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
375 lo1 = lo1 * vLo;
376 x = _mm_shuffle_ps(lo0, lo1, 0x88);
377 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
378 }
379 break;
380 case 2:
381 {
382 v0 = vertices[0];
383 v1 = vertices[1];
384 float4 xy = _mm_movelh_ps(v0, v1);
385 z = _mm_movehl_ps(v1, v0);
386 xy = xy * vLo;
387 z = _mm_shuffle_ps(z, z, 0xa8);
388 x = _mm_shuffle_ps(xy, xy, 0xa8);
389 y = _mm_shuffle_ps(xy, xy, 0xfd);
390 z = z * vHi;
391 }
392 break;
393 case 1:
394 {
395 float4 xy = vertices[0];
396 z = _mm_shuffle_ps(xy, xy, 0xaa);
397 xy = xy * vLo;
398 z = z * vHi;
399 x = _mm_shuffle_ps(xy, xy, 0);
400 y = _mm_shuffle_ps(xy, xy, 0x55);
401 }
402 break;
403 }
404 x = x + y;
405 x = x + z;
406 stack_array[index] = x;
407 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
408 index++;
409 }
410
411 // if we found a new max.
412 if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
413 { // we found a new max. Search for it
414 // find max across the max vector, place in all elements of max -- big latency hit here
415 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
416 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
417
418 // It is slightly faster to do this part in scalar code when count < 8. However, the common case
419 // where it actually makes a difference is handled by the early out at the top of the function,
420 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
421 // complexity, and removed it.
422
423 dotMax = max;
424
425 // scan for the first occurrence of max in the array
426 size_t test;
427 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
428 {
429 }
430 maxIndex = 4 * index + segment + indexTable[test];
431 }
432
433 _mm_store_ss(dotResult, dotMax);
434 return maxIndex;
435}
436
437long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
438
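// _mindot_large mirrors _maxdot_large: the same tiling and SSE transpose scheme, but tracking the running
// minimum with _mm_min_ps/minps and returning the index of the smallest dot product, with the value itself
// written to *dotResult.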
439long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
440{
441 const float4 *vertices = (const float4 *)vv;
442 static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
443 float4 dotmin = btAssign128(BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY);
444 float4 vvec = _mm_loadu_ps(vec);
445 float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));
446 float4 vLo = _mm_movelh_ps(vvec, vvec);
447
448 long minIndex = -1L;
449
450 size_t segment = 0;
451 float4 stack_array[STACK_ARRAY_COUNT];
452
453#if DEBUG
454 //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
455#endif
456
457 size_t index;
458 float4 min;
459 // Faster loop without cleanup code for full tiles
460 for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
461 {
462 min = dotmin;
463
464 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
465 { // do four dot products at a time. Carefully avoid touching the w element.
466 float4 v0 = vertices[0];
467 float4 v1 = vertices[1];
468 float4 v2 = vertices[2];
469 float4 v3 = vertices[3];
470 vertices += 4;
471
472 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
473 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
474 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
475 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
476
477 lo0 = lo0 * vLo;
478 lo1 = lo1 * vLo;
479 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
480 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
481 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
482 z = z * vHi;
483 x = x + y;
484 x = x + z;
485 stack_array[index] = x;
486 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
487
488 v0 = vertices[0];
489 v1 = vertices[1];
490 v2 = vertices[2];
491 v3 = vertices[3];
492 vertices += 4;
493
494 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
495 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
496 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
497 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
498
499 lo0 = lo0 * vLo;
500 lo1 = lo1 * vLo;
501 z = _mm_shuffle_ps(hi0, hi1, 0x88);
502 x = _mm_shuffle_ps(lo0, lo1, 0x88);
503 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
504 z = z * vHi;
505 x = x + y;
506 x = x + z;
507 stack_array[index + 1] = x;
508 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
509
510 v0 = vertices[0];
511 v1 = vertices[1];
512 v2 = vertices[2];
513 v3 = vertices[3];
514 vertices += 4;
515
516 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
517 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
518 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
519 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
520
521 lo0 = lo0 * vLo;
522 lo1 = lo1 * vLo;
523 z = _mm_shuffle_ps(hi0, hi1, 0x88);
524 x = _mm_shuffle_ps(lo0, lo1, 0x88);
525 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
526 z = z * vHi;
527 x = x + y;
528 x = x + z;
529 stack_array[index + 2] = x;
530 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
531
532 v0 = vertices[0];
533 v1 = vertices[1];
534 v2 = vertices[2];
535 v3 = vertices[3];
536 vertices += 4;
537
538 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
539 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
540 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
541 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
542
543 lo0 = lo0 * vLo;
544 lo1 = lo1 * vLo;
545 z = _mm_shuffle_ps(hi0, hi1, 0x88);
546 x = _mm_shuffle_ps(lo0, lo1, 0x88);
547 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
548 z = z * vHi;
549 x = x + y;
550 x = x + z;
551 stack_array[index + 3] = x;
552 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
553
554 // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
555 }
556
557 // If we found a new min
558 if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
559 {
560 // copy the new min across all lanes of our min accumulator
561 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
562 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
563
564 dotmin = min;
565
566 // find first occurrence of that min
567 size_t test;
568 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
569 {
570 }
571 // record where it is.
572 minIndex = 4 * index + segment + indexTable[test];
573 }
574 }
575
576 // account for work we've already done
577 count -= segment;
578
579 // Deal with the last < STACK_ARRAY_COUNT vectors
580 min = dotmin;
581 index = 0;
582
583 if (btUnlikely(count > 16))
584 {
585 for (; index + 4 <= count / 4; index += 4)
586 { // do four dot products at a time. Carefully avoid touching the w element.
587 float4 v0 = vertices[0];
588 float4 v1 = vertices[1];
589 float4 v2 = vertices[2];
590 float4 v3 = vertices[3];
591 vertices += 4;
592
593 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
594 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
595 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
596 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
597
598 lo0 = lo0 * vLo;
599 lo1 = lo1 * vLo;
600 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
601 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
602 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
603 z = z * vHi;
604 x = x + y;
605 x = x + z;
606 stack_array[index] = x;
607 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
608
609 v0 = vertices[0];
610 v1 = vertices[1];
611 v2 = vertices[2];
612 v3 = vertices[3];
613 vertices += 4;
614
615 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
616 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
617 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
618 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
619
620 lo0 = lo0 * vLo;
621 lo1 = lo1 * vLo;
622 z = _mm_shuffle_ps(hi0, hi1, 0x88);
623 x = _mm_shuffle_ps(lo0, lo1, 0x88);
624 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
625 z = z * vHi;
626 x = x + y;
627 x = x + z;
628 stack_array[index + 1] = x;
629 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
630
631 v0 = vertices[0];
632 v1 = vertices[1];
633 v2 = vertices[2];
634 v3 = vertices[3];
635 vertices += 4;
636
637 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
638 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
639 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
640 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
641
642 lo0 = lo0 * vLo;
643 lo1 = lo1 * vLo;
644 z = _mm_shuffle_ps(hi0, hi1, 0x88);
645 x = _mm_shuffle_ps(lo0, lo1, 0x88);
646 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
647 z = z * vHi;
648 x = x + y;
649 x = x + z;
650 stack_array[index + 2] = x;
651 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
652
653 v0 = vertices[0];
654 v1 = vertices[1];
655 v2 = vertices[2];
656 v3 = vertices[3];
657 vertices += 4;
658
659 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
660 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
661 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
662 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
663
664 lo0 = lo0 * vLo;
665 lo1 = lo1 * vLo;
666 z = _mm_shuffle_ps(hi0, hi1, 0x88);
667 x = _mm_shuffle_ps(lo0, lo1, 0x88);
668 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
669 z = z * vHi;
670 x = x + y;
671 x = x + z;
672 stack_array[index + 3] = x;
673 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
674
675 // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
676 }
677 }
678
679 size_t localCount = (count & -4L) - 4 * index;
680 if (localCount)
681 {
682#ifdef __APPLE__
683 vertices += localCount; // counter the offset
684 float4 t0, t1, t2, t3, t4;
685 size_t byteIndex = -(localCount) * sizeof(float);
686 float4 *sap = &stack_array[index + localCount / 4];
687
688 asm volatile(
689 ".align 4 \n\
690 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
691 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
692 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
693 movaps %[t0], %[min] // vertices[0] \n\
694 movlhps %[t1], %[min] // x0y0x1y1 \n\
695 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
696 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
697 mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
698 movhlps %[t0], %[t1] // z0w0z1w1 \n\
699 movaps %[t3], %[t0] // vertices[2] \n\
700 movlhps %[t4], %[t0] // x2y2x3y3 \n\
701 movhlps %[t3], %[t4] // z2w2z3w3 \n\
702 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
703 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
704 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
705 movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
706 shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
707 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
708 addps %[t3], %[min] // x + y \n\
709 addps %[t1], %[min] // x + y + z \n\
710 movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
711 minps %[t2], %[min] // record min, restore min \n\
712 add $16, %[byteIndex] // advance loop counter\n\
713 jnz 0b \n\
714 "
715 : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
716 : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
717 : "memory", "cc");
718 index += localCount / 4;
719#else
720 {
721 for (unsigned int i = 0; i < localCount / 4; i++, index++)
722 { // do four dot products at a time. Carefully avoid touching the w element.
723 float4 v0 = vertices[0];
724 float4 v1 = vertices[1];
725 float4 v2 = vertices[2];
726 float4 v3 = vertices[3];
727 vertices += 4;
728
729 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
730 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
731 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
732 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
733
734 lo0 = lo0 * vLo;
735 lo1 = lo1 * vLo;
736 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
737 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
738 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
739 z = z * vHi;
740 x = x + y;
741 x = x + z;
742 stack_array[index] = x;
743 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is NaN
744 }
745 }
746
747#endif
748 }
749
750 // process the last few points
751 if (count & 3)
752 {
753 float4 v0, v1, v2, x, y, z;
754 switch (count & 3)
755 {
756 case 3:
757 {
758 v0 = vertices[0];
759 v1 = vertices[1];
760 v2 = vertices[2];
761
762 // Calculate 3 dot products, transpose, duplicate v2
763 float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
764 float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
765 lo0 = lo0 * vLo;
766 z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
767 z = z * vHi;
768 float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
769 lo1 = lo1 * vLo;
770 x = _mm_shuffle_ps(lo0, lo1, 0x88);
771 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
772 }
773 break;
774 case 2:
775 {
776 v0 = vertices[0];
777 v1 = vertices[1];
778 float4 xy = _mm_movelh_ps(v0, v1);
779 z = _mm_movehl_ps(v1, v0);
780 xy = xy * vLo;
781 z = _mm_shuffle_ps(z, z, 0xa8);
782 x = _mm_shuffle_ps(xy, xy, 0xa8);
783 y = _mm_shuffle_ps(xy, xy, 0xfd);
784 z = z * vHi;
785 }
786 break;
787 case 1:
788 {
789 float4 xy = vertices[0];
790 z = _mm_shuffle_ps(xy, xy, 0xaa);
791 xy = xy * vLo;
792 z = z * vHi;
793 x = _mm_shuffle_ps(xy, xy, 0);
794 y = _mm_shuffle_ps(xy, xy, 0x55);
795 }
796 break;
797 }
798 x = x + y;
799 x = x + z;
800 stack_array[index] = x;
801 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
802 index++;
803 }
804
805 // if we found a new min.
806 if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
807 { // we found a new min. Search for it
808 // find min across the min vector, place in all elements of min -- big latency hit here
809 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
810 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
811
812 // It is slightly faster to do this part in scalar code when count < 8. However, the common case
813 // where it actually makes a difference is handled by the early out at the top of the function,
814 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
815 // complexity, and removed it.
816
817 dotmin = min;
818
819 // scan for the first occurrence of min in the array
820 size_t test;
821 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
822 {
823 }
824 minIndex = 4 * index + segment + indexTable[test];
825 }
826
827 _mm_store_ss(dotResult, dotmin);
828 return minIndex;
829}
830
831#elif defined BT_USE_NEON
832
833#define ARM_NEON_GCC_COMPATIBILITY 1
834#include <arm_neon.h>
835#include <sys/types.h>
836#include <sys/sysctl.h> //for sysctlbyname
837
838static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
839static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
840static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
841static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
842static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
843static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
844
845long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
846long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
847
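// Runtime dispatch for the NEON build: _maxdot_large and _mindot_large start out pointing at the *_sel
// stubs below, which query the hw.optional.neon_hpfp sysctl once, rebind the function pointers to the v1
// variants (128-bit quad registers) when the feature is reported or to the v0 variants (64-bit dual
// registers) otherwise, and then forward the first call to the chosen implementation.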
848static inline uint32_t btGetCpuCapabilities(void)
849{
850 static uint32_t capabilities = 0;
851 static bool testedCapabilities = false;
852
853 if (0 == testedCapabilities)
854 {
855 uint32_t hasFeature = 0;
856 size_t featureSize = sizeof(hasFeature);
857 int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);
858
859 if (0 == err && hasFeature)
860 capabilities |= 0x2000;
861
862 testedCapabilities = true;
863 }
864
865 return capabilities;
866}
867
868static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
869{
870 if (btGetCpuCapabilities() & 0x2000)
871 _maxdot_large = _maxdot_large_v1;
872 else
873 _maxdot_large = _maxdot_large_v0;
874
875 return _maxdot_large(vv, vec, count, dotResult);
876}
877
878static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
879{
880 if (btGetCpuCapabilities() & 0x2000)
881 _mindot_large = _mindot_large_v1;
882 else
883 _mindot_large = _mindot_large_v0;
884
885 return _mindot_large(vv, vec, count, dotResult);
886}
887
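// vld1q_f32_aligned_postincrement loads one 16-byte-aligned float32x4_t from _ptr and advances _ptr past
// it: 32-bit ARM uses an inline-asm vld1.f32 with post-increment, while the 64-bit path does a plain load
// and steps the pointer forward by 16 bytes.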
888#if defined __arm__
889#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
890#else
891//support 64bit arm
892#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
893#endif
894
895long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
896{
897 unsigned long i = 0;
898 float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
899 float32x2_t vLo = vget_low_f32(vvec);
900 float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
901 float32x2_t dotMaxLo = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
902 float32x2_t dotMaxHi = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
903 uint32x2_t indexLo = (uint32x2_t){0, 1};
904 uint32x2_t indexHi = (uint32x2_t){2, 3};
905 uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
906 uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
907 const uint32x2_t four = (uint32x2_t){4, 4};
908
909 for (; i + 8 <= count; i += 8)
910 {
911 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
912 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
913 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
914 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
915
916 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
917 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
918 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
919 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
920
921 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
922 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
923 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
924 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
925
926 float32x2_t rLo = vpadd_f32(xy0, xy1);
927 float32x2_t rHi = vpadd_f32(xy2, xy3);
928 rLo = vadd_f32(rLo, zLo);
929 rHi = vadd_f32(rHi, zHi);
930
931 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
932 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
933 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
934 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
935 iLo = vbsl_u32(maskLo, indexLo, iLo);
936 iHi = vbsl_u32(maskHi, indexHi, iHi);
937 indexLo = vadd_u32(indexLo, four);
938 indexHi = vadd_u32(indexHi, four);
939
940 v0 = vld1q_f32_aligned_postincrement(vv);
941 v1 = vld1q_f32_aligned_postincrement(vv);
942 v2 = vld1q_f32_aligned_postincrement(vv);
943 v3 = vld1q_f32_aligned_postincrement(vv);
944
945 xy0 = vmul_f32(vget_low_f32(v0), vLo);
946 xy1 = vmul_f32(vget_low_f32(v1), vLo);
947 xy2 = vmul_f32(vget_low_f32(v2), vLo);
948 xy3 = vmul_f32(vget_low_f32(v3), vLo);
949
950 z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
951 z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
952 zLo = vmul_f32(z0.val[0], vHi);
953 zHi = vmul_f32(z1.val[0], vHi);
954
955 rLo = vpadd_f32(xy0, xy1);
956 rHi = vpadd_f32(xy2, xy3);
957 rLo = vadd_f32(rLo, zLo);
958 rHi = vadd_f32(rHi, zHi);
959
960 maskLo = vcgt_f32(rLo, dotMaxLo);
961 maskHi = vcgt_f32(rHi, dotMaxHi);
962 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
963 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
964 iLo = vbsl_u32(maskLo, indexLo, iLo);
965 iHi = vbsl_u32(maskHi, indexHi, iHi);
966 indexLo = vadd_u32(indexLo, four);
967 indexHi = vadd_u32(indexHi, four);
968 }
969
970 for (; i + 4 <= count; i += 4)
971 {
972 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
973 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
974 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
975 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
976
977 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
978 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
979 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
980 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
981
982 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
983 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
984 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
985 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
986
987 float32x2_t rLo = vpadd_f32(xy0, xy1);
988 float32x2_t rHi = vpadd_f32(xy2, xy3);
989 rLo = vadd_f32(rLo, zLo);
990 rHi = vadd_f32(rHi, zHi);
991
992 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
993 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
994 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
995 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
996 iLo = vbsl_u32(maskLo, indexLo, iLo);
997 iHi = vbsl_u32(maskHi, indexHi, iHi);
998 indexLo = vadd_u32(indexLo, four);
999 indexHi = vadd_u32(indexHi, four);
1000 }
1001
1002 switch (count & 3)
1003 {
1004 case 3:
1005 {
1006 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1007 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1008 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1009
1010 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1011 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1012 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1013
1014 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1015 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1016 float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1017
1018 float32x2_t rLo = vpadd_f32(xy0, xy1);
1019 float32x2_t rHi = vpadd_f32(xy2, xy2);
1020 rLo = vadd_f32(rLo, zLo);
1021 rHi = vadd_f32(rHi, zHi);
1022
1023 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1024 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
1025 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1026 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
1027 iLo = vbsl_u32(maskLo, indexLo, iLo);
1028 iHi = vbsl_u32(maskHi, indexHi, iHi);
1029 }
1030 break;
1031 case 2:
1032 {
1033 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1034 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1035
1036 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1037 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1038
1039 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1040 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1041
1042 float32x2_t rLo = vpadd_f32(xy0, xy1);
1043 rLo = vadd_f32(rLo, zLo);
1044
1045 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1046 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1047 iLo = vbsl_u32(maskLo, indexLo, iLo);
1048 }
1049 break;
1050 case 1:
1051 {
1052 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1053 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1054 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1055 float32x2_t zLo = vmul_f32(z0, vHi);
1056 float32x2_t rLo = vpadd_f32(xy0, xy0);
1057 rLo = vadd_f32(rLo, zLo);
1058 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1059 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1060 iLo = vbsl_u32(maskLo, indexLo, iLo);
1061 }
1062 break;
1063
1064 default:
1065 break;
1066 }
1067
1068 // select best answer between hi and lo results
1069 uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
1070 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1071 iLo = vbsl_u32(mask, iHi, iLo);
1072
1073 // select best answer between even and odd results
1074 dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1075 iHi = vdup_lane_u32(iLo, 1);
1076 mask = vcgt_f32(dotMaxHi, dotMaxLo);
1077 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1078 iLo = vbsl_u32(mask, iHi, iLo);
1079
1080 *dotResult = vget_lane_f32(dotMaxLo, 0);
1081 return vget_lane_u32(iLo, 0);
1082}
1083
1084long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
1085{
1086 float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1087 float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1088 float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1089 const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1090 uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1091 uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1092 float32x4_t maxDot = (float32x4_t){-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY};
1093
1094 unsigned long i = 0;
1095 for (; i + 8 <= count; i += 8)
1096 {
1097 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1098 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1099 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1100 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1101
1102 // the next two lines should resolve to a single vswp d, d
1103 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1104 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1105 // the next two lines should resolve to a single vswp d, d
1106 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1107 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1108
1109 xy0 = vmulq_f32(xy0, vLo);
1110 xy1 = vmulq_f32(xy1, vLo);
1111
1112 float32x4x2_t zb = vuzpq_f32(z0, z1);
1113 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1114 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1115 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1116 x = vaddq_f32(x, z);
1117
1118 uint32x4_t mask = vcgtq_f32(x, maxDot);
1119 maxDot = vbslq_f32(mask, x, maxDot);
1120 index = vbslq_u32(mask, local_index, index);
1121 local_index = vaddq_u32(local_index, four);
1122
1123 v0 = vld1q_f32_aligned_postincrement(vv);
1124 v1 = vld1q_f32_aligned_postincrement(vv);
1125 v2 = vld1q_f32_aligned_postincrement(vv);
1126 v3 = vld1q_f32_aligned_postincrement(vv);
1127
1128 // the next two lines should resolve to a single vswp d, d
1129 xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1130 xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1131 // the next two lines should resolve to a single vswp d, d
1132 z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1133 z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1134
1135 xy0 = vmulq_f32(xy0, vLo);
1136 xy1 = vmulq_f32(xy1, vLo);
1137
1138 zb = vuzpq_f32(z0, z1);
1139 z = vmulq_f32(zb.val[0], vHi);
1140 xy = vuzpq_f32(xy0, xy1);
1141 x = vaddq_f32(xy.val[0], xy.val[1]);
1142 x = vaddq_f32(x, z);
1143
1144 mask = vcgtq_f32(x, maxDot);
1145 maxDot = vbslq_f32(mask, x, maxDot);
1146 index = vbslq_u32(mask, local_index, index);
1147 local_index = vaddq_u32(local_index, four);
1148 }
1149
1150 for (; i + 4 <= count; i += 4)
1151 {
1152 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1153 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1154 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1155 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1156
1157 // the next two lines should resolve to a single vswp d, d
1158 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1159 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1160 // the next two lines should resolve to a single vswp d, d
1161 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1162 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1163
1164 xy0 = vmulq_f32(xy0, vLo);
1165 xy1 = vmulq_f32(xy1, vLo);
1166
1167 float32x4x2_t zb = vuzpq_f32(z0, z1);
1168 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1169 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1170 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1171 x = vaddq_f32(x, z);
1172
1173 uint32x4_t mask = vcgtq_f32(x, maxDot);
1174 maxDot = vbslq_f32(mask, x, maxDot);
1175 index = vbslq_u32(mask, local_index, index);
1176 local_index = vaddq_u32(local_index, four);
1177 }
1178
1179 switch (count & 3)
1180 {
1181 case 3:
1182 {
1183 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1184 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1185 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1186
1187 // the next two lines should resolve to a single vswp d, d
1188 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1189 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1190 // the next two lines should resolve to a single vswp d, d
1191 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1192 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1193
1194 xy0 = vmulq_f32(xy0, vLo);
1195 xy1 = vmulq_f32(xy1, vLo);
1196
1197 float32x4x2_t zb = vuzpq_f32(z0, z1);
1198 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1199 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1200 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1201 x = vaddq_f32(x, z);
1202
1203 uint32x4_t mask = vcgtq_f32(x, maxDot);
1204 maxDot = vbslq_f32(mask, x, maxDot);
1205 index = vbslq_u32(mask, local_index, index);
1206 local_index = vaddq_u32(local_index, four);
1207 }
1208 break;
1209
1210 case 2:
1211 {
1212 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1213 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1214
1215 // the next two lines should resolve to a single vswp d, d
1216 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1217 // the next two lines should resolve to a single vswp d, d
1218 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1219
1220 xy0 = vmulq_f32(xy0, vLo);
1221
1222 float32x4x2_t zb = vuzpq_f32(z0, z0);
1223 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1224 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1225 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1226 x = vaddq_f32(x, z);
1227
1228 uint32x4_t mask = vcgtq_f32(x, maxDot);
1229 maxDot = vbslq_f32(mask, x, maxDot);
1230 index = vbslq_u32(mask, local_index, index);
1231 local_index = vaddq_u32(local_index, four);
1232 }
1233 break;
1234
1235 case 1:
1236 {
1237 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1238
1239 // the next two lines should resolve to a single vswp d, d
1240 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
1241 // the next two lines should resolve to a single vswp d, d
1242 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1243
1244 xy0 = vmulq_f32(xy0, vLo);
1245
1246 z = vmulq_f32(z, vHi);
1247 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1248 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1249 x = vaddq_f32(x, z);
1250
1251 uint32x4_t mask = vcgtq_f32(x, maxDot);
1252 maxDot = vbslq_f32(mask, x, maxDot);
1253 index = vbslq_u32(mask, local_index, index);
1254 local_index = vaddq_u32(local_index, four);
1255 }
1256 break;
1257
1258 default:
1259 break;
1260 }
1261
1262 // select best answer between hi and lo results
1263 uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
1264 float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1265 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1266
1267 // select best answer between even and odd results
1268 float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1269 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1270 mask = vcgt_f32(maxDotO, maxDot2);
1271 maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1272 index2 = vbsl_u32(mask, indexHi, index2);
1273
1274 *dotResult = vget_lane_f32(maxDot2, 0);
1275 return vget_lane_u32(index2, 0);
1276}
1277
1278long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
1279{
1280 unsigned long i = 0;
1281 float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1282 float32x2_t vLo = vget_low_f32(vvec);
1283 float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
1284 float32x2_t dotMinLo = (float32x2_t){BT_INFINITY, BT_INFINITY};
1285 float32x2_t dotMinHi = (float32x2_t){BT_INFINITY, BT_INFINITY};
1286 uint32x2_t indexLo = (uint32x2_t){0, 1};
1287 uint32x2_t indexHi = (uint32x2_t){2, 3};
1288 uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1289 uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1290 const uint32x2_t four = (uint32x2_t){4, 4};
1291
1292 for (; i + 8 <= count; i += 8)
1293 {
1294 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1295 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1296 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1297 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1298
1299 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1300 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1301 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1302 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1303
1304 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1305 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1306 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1307 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1308
1309 float32x2_t rLo = vpadd_f32(xy0, xy1);
1310 float32x2_t rHi = vpadd_f32(xy2, xy3);
1311 rLo = vadd_f32(rLo, zLo);
1312 rHi = vadd_f32(rHi, zHi);
1313
1314 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1315 uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1316 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1317 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1318 iLo = vbsl_u32(maskLo, indexLo, iLo);
1319 iHi = vbsl_u32(maskHi, indexHi, iHi);
1320 indexLo = vadd_u32(indexLo, four);
1321 indexHi = vadd_u32(indexHi, four);
1322
1323 v0 = vld1q_f32_aligned_postincrement(vv);
1324 v1 = vld1q_f32_aligned_postincrement(vv);
1325 v2 = vld1q_f32_aligned_postincrement(vv);
1326 v3 = vld1q_f32_aligned_postincrement(vv);
1327
1328 xy0 = vmul_f32(vget_low_f32(v0), vLo);
1329 xy1 = vmul_f32(vget_low_f32(v1), vLo);
1330 xy2 = vmul_f32(vget_low_f32(v2), vLo);
1331 xy3 = vmul_f32(vget_low_f32(v3), vLo);
1332
1333 z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1334 z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1335 zLo = vmul_f32(z0.val[0], vHi);
1336 zHi = vmul_f32(z1.val[0], vHi);
1337
1338 rLo = vpadd_f32(xy0, xy1);
1339 rHi = vpadd_f32(xy2, xy3);
1340 rLo = vadd_f32(rLo, zLo);
1341 rHi = vadd_f32(rHi, zHi);
1342
1343 maskLo = vclt_f32(rLo, dotMinLo);
1344 maskHi = vclt_f32(rHi, dotMinHi);
1345 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1346 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1347 iLo = vbsl_u32(maskLo, indexLo, iLo);
1348 iHi = vbsl_u32(maskHi, indexHi, iHi);
1349 indexLo = vadd_u32(indexLo, four);
1350 indexHi = vadd_u32(indexHi, four);
1351 }
1352
1353 for (; i + 4 <= count; i += 4)
1354 {
1355 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1356 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1357 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1358 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1359
1360 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1361 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1362 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1363 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1364
1365 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1366 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1367 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1368 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1369
1370 float32x2_t rLo = vpadd_f32(xy0, xy1);
1371 float32x2_t rHi = vpadd_f32(xy2, xy3);
1372 rLo = vadd_f32(rLo, zLo);
1373 rHi = vadd_f32(rHi, zHi);
1374
1375 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1376 uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1377 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1378 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1379 iLo = vbsl_u32(maskLo, indexLo, iLo);
1380 iHi = vbsl_u32(maskHi, indexHi, iHi);
1381 indexLo = vadd_u32(indexLo, four);
1382 indexHi = vadd_u32(indexHi, four);
1383 }
1384 switch (count & 3)
1385 {
1386 case 3:
1387 {
1388 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1389 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1390 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1391
1392 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1393 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1394 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1395
1396 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1397 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1398 float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1399
1400 float32x2_t rLo = vpadd_f32(xy0, xy1);
1401 float32x2_t rHi = vpadd_f32(xy2, xy2);
1402 rLo = vadd_f32(rLo, zLo);
1403 rHi = vadd_f32(rHi, zHi);
1404
1405 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1406 uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1407 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1408 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1409 iLo = vbsl_u32(maskLo, indexLo, iLo);
1410 iHi = vbsl_u32(maskHi, indexHi, iHi);
1411 }
1412 break;
1413 case 2:
1414 {
1415 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1416 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1417
1418 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1419 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1420
1421 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1422 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1423
1424 float32x2_t rLo = vpadd_f32(xy0, xy1);
1425 rLo = vadd_f32(rLo, zLo);
1426
1427 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1428 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1429 iLo = vbsl_u32(maskLo, indexLo, iLo);
1430 }
1431 break;
1432 case 1:
1433 {
1434 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1435 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1436 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1437 float32x2_t zLo = vmul_f32(z0, vHi);
1438 float32x2_t rLo = vpadd_f32(xy0, xy0);
1439 rLo = vadd_f32(rLo, zLo);
1440 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1441 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1442 iLo = vbsl_u32(maskLo, indexLo, iLo);
1443 }
1444 break;
1445
1446 default:
1447 break;
1448 }
1449
1450 // select best answer between hi and lo results
1451 uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
1452 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1453 iLo = vbsl_u32(mask, iHi, iLo);
1454
1455 // select best answer between even and odd results
1456 dotMinHi = vdup_lane_f32(dotMinLo, 1);
1457 iHi = vdup_lane_u32(iLo, 1);
1458 mask = vclt_f32(dotMinHi, dotMinLo);
1459 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1460 iLo = vbsl_u32(mask, iHi, iLo);
1461
1462 *dotResult = vget_lane_f32(dotMinLo, 0);
1463 return vget_lane_u32(iLo, 0);
1464}
1465
1466long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
1467{
1468 float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1469 float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1470 float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1471 const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1472 uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1473 uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1474 float32x4_t minDot = (float32x4_t){BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY};
1475
1476 unsigned long i = 0;
1477 for (; i + 8 <= count; i += 8)
1478 {
1479 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1480 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1481 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1482 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1483
1484 // the next two lines should resolve to a single vswp d, d
1485 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1486 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1487 // the next two lines should resolve to a single vswp d, d
1488 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1489 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1490
1491 xy0 = vmulq_f32(xy0, vLo);
1492 xy1 = vmulq_f32(xy1, vLo);
1493
1494 float32x4x2_t zb = vuzpq_f32(z0, z1);
1495 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1496 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1497 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1498 x = vaddq_f32(x, z);
1499
1500 uint32x4_t mask = vcltq_f32(x, minDot);
1501 minDot = vbslq_f32(mask, x, minDot);
1502 index = vbslq_u32(mask, local_index, index);
1503 local_index = vaddq_u32(local_index, four);
1504
1505 v0 = vld1q_f32_aligned_postincrement(vv);
1506 v1 = vld1q_f32_aligned_postincrement(vv);
1507 v2 = vld1q_f32_aligned_postincrement(vv);
1508 v3 = vld1q_f32_aligned_postincrement(vv);
1509
1510 // the next two lines should resolve to a single vswp d, d
1511 xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1512 xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1513 // the next two lines should resolve to a single vswp d, d
1514 z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1515 z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1516
1517 xy0 = vmulq_f32(xy0, vLo);
1518 xy1 = vmulq_f32(xy1, vLo);
1519
1520 zb = vuzpq_f32(z0, z1);
1521 z = vmulq_f32(zb.val[0], vHi);
1522 xy = vuzpq_f32(xy0, xy1);
1523 x = vaddq_f32(xy.val[0], xy.val[1]);
1524 x = vaddq_f32(x, z);
1525
1526 mask = vcltq_f32(x, minDot);
1527 minDot = vbslq_f32(mask, x, minDot);
1528 index = vbslq_u32(mask, local_index, index);
1529 local_index = vaddq_u32(local_index, four);
1530 }
1531
1532 for (; i + 4 <= count; i += 4)
1533 {
1534 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1535 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1536 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1537 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1538
1539 // the next two lines should resolve to a single vswp d, d
1540 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1541 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1542 // the next two lines should resolve to a single vswp d, d
1543 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1544 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1545
1546 xy0 = vmulq_f32(xy0, vLo);
1547 xy1 = vmulq_f32(xy1, vLo);
1548
1549 float32x4x2_t zb = vuzpq_f32(z0, z1);
1550 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1551 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1552 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1553 x = vaddq_f32(x, z);
1554
1555 uint32x4_t mask = vcltq_f32(x, minDot);
1556 minDot = vbslq_f32(mask, x, minDot);
1557 index = vbslq_u32(mask, local_index, index);
1558 local_index = vaddq_u32(local_index, four);
1559 }
1560
1561 switch (count & 3)
1562 {
1563 case 3:
1564 {
1565 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1566 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1567 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1568
1569 // the next two lines should resolve to a single vswp d, d
1570 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1571 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1572 // the next two lines should resolve to a single vswp d, d
1573 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1574 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1575
1576 xy0 = vmulq_f32(xy0, vLo);
1577 xy1 = vmulq_f32(xy1, vLo);
1578
1579 float32x4x2_t zb = vuzpq_f32(z0, z1);
1580 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1581 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1582 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1583 x = vaddq_f32(x, z);
1584
1585 uint32x4_t mask = vcltq_f32(x, minDot);
1586 minDot = vbslq_f32(mask, x, minDot);
1587 index = vbslq_u32(mask, local_index, index);
1588 local_index = vaddq_u32(local_index, four);
1589 }
1590 break;
1591
1592 case 2:
1593 {
1594 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1595 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1596
1597 // the next two lines should resolve to a single vswp d, d
1598 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1599 // the next two lines should resolve to a single vswp d, d
1600 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1601
1602 xy0 = vmulq_f32(xy0, vLo);
1603
1604 float32x4x2_t zb = vuzpq_f32(z0, z0);
1605 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1606 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1607 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1608 x = vaddq_f32(x, z);
1609
1610 uint32x4_t mask = vcltq_f32(x, minDot);
1611 minDot = vbslq_f32(mask, x, minDot);
1612 index = vbslq_u32(mask, local_index, index);
1613 local_index = vaddq_u32(local_index, four);
1614 }
1615 break;
1616
1617 case 1:
1618 {
1619 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1620
1621 // the next two lines should resolve to a single vswp d, d
1622 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
1623 // the next two lines should resolve to a single vswp d, d
1624 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1625
1626 xy0 = vmulq_f32(xy0, vLo);
1627
1628 z = vmulq_f32(z, vHi);
1629 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1630 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1631 x = vaddq_f32(x, z);
1632
1633 uint32x4_t mask = vcltq_f32(x, minDot);
1634 minDot = vbslq_f32(mask, x, minDot);
1635 index = vbslq_u32(mask, local_index, index);
1636 local_index = vaddq_u32(local_index, four);
1637 }
1638 break;
1639
1640 default:
1641 break;
1642 }
1643
1644 // select best answer between hi and lo results
1645 uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
1646 float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1647 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1648
1649 // select best answer between even and odd results
1650 float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
1651 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1652 mask = vclt_f32(minDotO, minDot2);
1653 minDot2 = vbsl_f32(mask, minDotO, minDot2);
1654 index2 = vbsl_u32(mask, indexHi, index2);
1655
1656 *dotResult = vget_lane_f32(minDot2, 0);
1657 return vget_lane_u32(index2, 0);
1658}
1659
1660#else
1661#error Unhandled __APPLE__ arch
1662#endif
1663
1664#endif /* BT_USE_SIMD_VECTOR3 */