Bullet Collision Detection & Physics Library
btVector3.cpp
1/*
2 Copyright (c) 2011 Apple Inc.
3 https://bulletphysics.org
4
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose,
8 including commercial applications, and to alter it and redistribute it freely,
9 subject to the following restrictions:
10
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14
15 This source version has been altered.
16 */
17
18#if defined(_WIN32) || defined(__i386__)
19#define BT_USE_SSE_IN_API
20#endif
21
22#include "btVector3.h"
23
24#if defined BT_USE_SIMD_VECTOR3
25
26#if DEBUG
27#include <string.h> //for memset
28#endif
29
30#ifdef __APPLE__
31#include <stdint.h>
32typedef float float4 __attribute__((vector_size(16)));
33#else
34#define float4 __m128
35#endif
36//typedef uint32_t uint4 __attribute__ ((vector_size(16)));
37
38#if defined BT_USE_SSE || defined _WIN32
39
40#define LOG2_ARRAY_SIZE 6
41#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
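// STACK_ARRAY_COUNT (64) float4 slots of stack scratch hold the dot products
// computed for one segment of 4 * STACK_ARRAY_COUNT vertices. The kernels
// below track only the best value while streaming through a segment; only if
// a segment improved on the running best do they rescan the cached results to
// recover the index, keeping index bookkeeping off the hot path.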
42
43#include <emmintrin.h>
44
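// A minimal scalar sketch of the contract the SIMD kernels below implement
// (illustrative only; this disabled helper is not part of the Bullet API):
// each input vertex occupies 16 bytes (x, y, z plus an ignored w), the return
// value is the index of the vertex with the largest dot product against vec,
// and that dot product is written to *dotResult. _mindot_large is the same
// with the smallest dot product.
#if 0
static long maxdot_scalar_sketch(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	long best = -1L;
	float bestDot = -BT_INFINITY;
	for (unsigned long i = 0; i < count; i++)
	{
		const float *v = vv + 4 * i;  // w component is never read
		float d = v[0] * vec[0] + v[1] * vec[1] + v[2] * vec[2];
		if (d > bestDot)
		{
			bestDot = d;
			best = (long)i;
		}
	}
	*dotResult = bestDot;
	return best;
}
#endif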
45long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
46long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
47{
48 const float4 *vertices = (const float4 *)vv;
49 static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
// indexTable maps a non-zero _mm_movemask_ps result (1-15) to the index of its
// lowest set bit, i.e. the first SSE lane whose comparison was true.
50 float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY);
51 float4 vvec = _mm_loadu_ps(vec);
52 float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa)); /// zzzz
53 float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy
54
55 long maxIndex = -1L;
56
57 size_t segment = 0;
58 float4 stack_array[STACK_ARRAY_COUNT];
59
60#if DEBUG
61 //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
62#endif
63
64 size_t index;
65 float4 max;
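// Each unrolled step below loads four xyzw vertices and transposes them into
// x, y and z lanes with movelh/movehl plus shuffles, so a handful of SSE ops
// yields four dot products at once: per lane, x*vec.x + y*vec.y + z*vec.z,
// where vLo holds vec.xy duplicated and vHi holds vec.z splatted. Writing
// _mm_max_ps(x, max) with x as the first operand means a NaN in x leaves the
// running max untouched.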
66 // Faster loop without cleanup code for full tiles
67 for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
68 {
69 max = dotMax;
70
71 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
72 { // do four dot products at a time. Carefully avoid touching the w element.
73 float4 v0 = vertices[0];
74 float4 v1 = vertices[1];
75 float4 v2 = vertices[2];
76 float4 v3 = vertices[3];
77 vertices += 4;
78
79 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
80 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
81 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
82 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
83
84 lo0 = lo0 * vLo;
85 lo1 = lo1 * vLo;
86 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
87 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
88 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
89 z = z * vHi;
90 x = x + y;
91 x = x + z;
92 stack_array[index] = x;
93 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
94
95 v0 = vertices[0];
96 v1 = vertices[1];
97 v2 = vertices[2];
98 v3 = vertices[3];
99 vertices += 4;
100
101 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
102 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
103 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
104 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
105
106 lo0 = lo0 * vLo;
107 lo1 = lo1 * vLo;
108 z = _mm_shuffle_ps(hi0, hi1, 0x88);
109 x = _mm_shuffle_ps(lo0, lo1, 0x88);
110 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
111 z = z * vHi;
112 x = x + y;
113 x = x + z;
114 stack_array[index + 1] = x;
115 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
116
117 v0 = vertices[0];
118 v1 = vertices[1];
119 v2 = vertices[2];
120 v3 = vertices[3];
121 vertices += 4;
122
123 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
124 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
125 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
126 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
127
128 lo0 = lo0 * vLo;
129 lo1 = lo1 * vLo;
130 z = _mm_shuffle_ps(hi0, hi1, 0x88);
131 x = _mm_shuffle_ps(lo0, lo1, 0x88);
132 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
133 z = z * vHi;
134 x = x + y;
135 x = x + z;
136 stack_array[index + 2] = x;
137 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
138
139 v0 = vertices[0];
140 v1 = vertices[1];
141 v2 = vertices[2];
142 v3 = vertices[3];
143 vertices += 4;
144
145 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
146 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
147 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
148 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
149
150 lo0 = lo0 * vLo;
151 lo1 = lo1 * vLo;
152 z = _mm_shuffle_ps(hi0, hi1, 0x88);
153 x = _mm_shuffle_ps(lo0, lo1, 0x88);
154 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
155 z = z * vHi;
156 x = x + y;
157 x = x + z;
158 stack_array[index + 3] = x;
159 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
160
161 // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
162 }
163
164 // If we found a new max
165 if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
166 {
167 // copy the new max across all lanes of our max accumulator
168 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
169 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
170
171 dotMax = max;
172
173 // find first occurrence of that max
174 size_t test;
175 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
176 {
177 }
178 // record where it is.
179 maxIndex = 4 * index + segment + indexTable[test];
180 }
181 }
182
183 // account for work we've already done
184 count -= segment;
185
186 // Deal with the last < STACK_ARRAY_COUNT vectors
187 max = dotMax;
188 index = 0;
189
190 if (btUnlikely(count > 16))
191 {
192 for (; index + 4 <= count / 4; index += 4)
193 { // do four dot products at a time. Carefully avoid touching the w element.
194 float4 v0 = vertices[0];
195 float4 v1 = vertices[1];
196 float4 v2 = vertices[2];
197 float4 v3 = vertices[3];
198 vertices += 4;
199
200 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
201 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
202 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
203 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
204
205 lo0 = lo0 * vLo;
206 lo1 = lo1 * vLo;
207 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
210 z = z * vHi;
211 x = x + y;
212 x = x + z;
213 stack_array[index] = x;
214 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
215
216 v0 = vertices[0];
217 v1 = vertices[1];
218 v2 = vertices[2];
219 v3 = vertices[3];
220 vertices += 4;
221
222 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
223 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
224 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
225 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
226
227 lo0 = lo0 * vLo;
228 lo1 = lo1 * vLo;
229 z = _mm_shuffle_ps(hi0, hi1, 0x88);
230 x = _mm_shuffle_ps(lo0, lo1, 0x88);
231 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
232 z = z * vHi;
233 x = x + y;
234 x = x + z;
235 stack_array[index + 1] = x;
236 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
237
238 v0 = vertices[0];
239 v1 = vertices[1];
240 v2 = vertices[2];
241 v3 = vertices[3];
242 vertices += 4;
243
244 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
245 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
246 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
247 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
248
249 lo0 = lo0 * vLo;
250 lo1 = lo1 * vLo;
251 z = _mm_shuffle_ps(hi0, hi1, 0x88);
252 x = _mm_shuffle_ps(lo0, lo1, 0x88);
253 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
254 z = z * vHi;
255 x = x + y;
256 x = x + z;
257 stack_array[index + 2] = x;
258 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
259
260 v0 = vertices[0];
261 v1 = vertices[1];
262 v2 = vertices[2];
263 v3 = vertices[3];
264 vertices += 4;
265
266 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
267 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
268 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
269 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
270
271 lo0 = lo0 * vLo;
272 lo1 = lo1 * vLo;
273 z = _mm_shuffle_ps(hi0, hi1, 0x88);
274 x = _mm_shuffle_ps(lo0, lo1, 0x88);
275 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
276 z = z * vHi;
277 x = x + y;
278 x = x + z;
279 stack_array[index + 3] = x;
280 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
281
282 // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
283 }
284 }
285
286 size_t localCount = (count & -4L) - 4 * index;
287 if (localCount)
288 {
289#ifdef __APPLE__
290 float4 t0, t1, t2, t3, t4;
291 float4 *sap = &stack_array[index + localCount / 4];
292 vertices += localCount; // counter the offset
293 size_t byteIndex = -(localCount) * sizeof(float);
294 //AT&T Code style assembly
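// The loop below mirrors the intrinsics path for the remaining full groups of
// four vertices. byteIndex starts at -localCount*sizeof(float) and is advanced
// by 16 until it reaches zero (the jnz falls through), so
// (vertices, byteIndex, 4) walks the pending vertices while (sap, byteIndex)
// stores each group's four dot products for the later index rescan.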
295 asm volatile(
296 ".align 4 \n\
297 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
298 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
299 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
300 movaps %[t0], %[max] // vertices[0] \n\
301 movlhps %[t1], %[max] // x0y0x1y1 \n\
302 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
303 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
304 mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
305 movhlps %[t0], %[t1] // z0w0z1w1 \n\
306 movaps %[t3], %[t0] // vertices[2] \n\
307 movlhps %[t4], %[t0] // x2y2x3y3 \n\
308 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
309 movhlps %[t3], %[t4] // z2w2z3w3 \n\
310 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
311 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
312 movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
313 shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
314 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
315 addps %[t3], %[max] // x + y \n\
316 addps %[t1], %[max] // x + y + z \n\
317 movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
318 maxps %[t2], %[max] // record max, restore max \n\
319 add $16, %[byteIndex] // advance loop counter\n\
320 jnz 0b \n\
321 "
322 : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
323 : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
324 : "memory", "cc");
325 index += localCount / 4;
326#else
327 {
328 for (unsigned int i = 0; i < localCount / 4; i++, index++)
329 { // do four dot products at a time. Carefully avoid touching the w element.
330 float4 v0 = vertices[0];
331 float4 v1 = vertices[1];
332 float4 v2 = vertices[2];
333 float4 v3 = vertices[3];
334 vertices += 4;
335
336 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
337 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
338 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
339 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
340
341 lo0 = lo0 * vLo;
342 lo1 = lo1 * vLo;
343 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
344 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
345 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
346 z = z * vHi;
347 x = x + y;
348 x = x + z;
349 stack_array[index] = x;
350 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
351 }
352 }
353#endif //__APPLE__
354 }
355
356 // process the last few points
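// The 1-3 leftover vertices still produce a full float4 of dot products; the
// unused lanes are filled by repeating a vertex that is already present
// (case 3 duplicates v2, cases 2 and 1 reuse shuffled lanes), so the padding
// can never introduce a spurious maximum.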
357 if (count & 3)
358 {
359 float4 v0, v1, v2, x, y, z;
360 switch (count & 3)
361 {
362 case 3:
363 {
364 v0 = vertices[0];
365 v1 = vertices[1];
366 v2 = vertices[2];
367
368 // Calculate 3 dot products, transpose, duplicate v2
369 float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
370 float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
371 lo0 = lo0 * vLo;
372 z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
373 z = z * vHi;
374 float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
375 lo1 = lo1 * vLo;
376 x = _mm_shuffle_ps(lo0, lo1, 0x88);
377 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
378 }
379 break;
380 case 2:
381 {
382 v0 = vertices[0];
383 v1 = vertices[1];
384 float4 xy = _mm_movelh_ps(v0, v1);
385 z = _mm_movehl_ps(v1, v0);
386 xy = xy * vLo;
387 z = _mm_shuffle_ps(z, z, 0xa8);
388 x = _mm_shuffle_ps(xy, xy, 0xa8);
389 y = _mm_shuffle_ps(xy, xy, 0xfd);
390 z = z * vHi;
391 }
392 break;
393 case 1:
394 {
395 float4 xy = vertices[0];
396 z = _mm_shuffle_ps(xy, xy, 0xaa);
397 xy = xy * vLo;
398 z = z * vHi;
399 x = _mm_shuffle_ps(xy, xy, 0);
400 y = _mm_shuffle_ps(xy, xy, 0x55);
401 }
402 break;
403 }
404 x = x + y;
405 x = x + z;
406 stack_array[index] = x;
407 max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
408 index++;
409 }
410
411 // if we found a new max.
412 if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
413 { // we found a new max. Search for it
414 // find max across the max vector, place in all elements of max -- big latency hit here
415 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
416 max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
417
418 // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
419 // this where it actually makes a difference is handled in the early out at the top of the function,
420 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
421 // complexity, and removed it.
422
423 dotMax = max;
424
425 // scan for the first occurrence of max in the array
426 size_t test;
427 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
428 {
429 }
430 maxIndex = 4 * index + segment + indexTable[test];
431 }
432
433 _mm_store_ss(dotResult, dotMax);
434 return maxIndex;
435}
436
437long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
438
439long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
440{
441 const float4 *vertices = (const float4 *)vv;
442 static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
443 float4 dotmin = btAssign128(BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY);
444 float4 vvec = _mm_loadu_ps(vec);
445 float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa)); /// zzzz
446 float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy
447
448 long minIndex = -1L;
449
450 size_t segment = 0;
451 float4 stack_array[STACK_ARRAY_COUNT];
452
453#if DEBUG
454 //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
455#endif
456
457 size_t index;
458 float4 min;
459 // Faster loop without cleanup code for full tiles
460 for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
461 {
462 min = dotmin;
463
464 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
465 { // do four dot products at a time. Carefully avoid touching the w element.
466 float4 v0 = vertices[0];
467 float4 v1 = vertices[1];
468 float4 v2 = vertices[2];
469 float4 v3 = vertices[3];
470 vertices += 4;
471
472 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
473 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
474 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
475 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
476
477 lo0 = lo0 * vLo;
478 lo1 = lo1 * vLo;
479 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
480 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
481 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
482 z = z * vHi;
483 x = x + y;
484 x = x + z;
485 stack_array[index] = x;
486 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
487
488 v0 = vertices[0];
489 v1 = vertices[1];
490 v2 = vertices[2];
491 v3 = vertices[3];
492 vertices += 4;
493
494 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
495 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
496 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
497 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
498
499 lo0 = lo0 * vLo;
500 lo1 = lo1 * vLo;
501 z = _mm_shuffle_ps(hi0, hi1, 0x88);
502 x = _mm_shuffle_ps(lo0, lo1, 0x88);
503 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
504 z = z * vHi;
505 x = x + y;
506 x = x + z;
507 stack_array[index + 1] = x;
508 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
509
510 v0 = vertices[0];
511 v1 = vertices[1];
512 v2 = vertices[2];
513 v3 = vertices[3];
514 vertices += 4;
515
516 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
517 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
518 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
519 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
520
521 lo0 = lo0 * vLo;
522 lo1 = lo1 * vLo;
523 z = _mm_shuffle_ps(hi0, hi1, 0x88);
524 x = _mm_shuffle_ps(lo0, lo1, 0x88);
525 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
526 z = z * vHi;
527 x = x + y;
528 x = x + z;
529 stack_array[index + 2] = x;
530 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
531
532 v0 = vertices[0];
533 v1 = vertices[1];
534 v2 = vertices[2];
535 v3 = vertices[3];
536 vertices += 4;
537
538 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
539 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
540 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
541 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
542
543 lo0 = lo0 * vLo;
544 lo1 = lo1 * vLo;
545 z = _mm_shuffle_ps(hi0, hi1, 0x88);
546 x = _mm_shuffle_ps(lo0, lo1, 0x88);
547 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
548 z = z * vHi;
549 x = x + y;
550 x = x + z;
551 stack_array[index + 3] = x;
552 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
553
554 // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
555 }
556
557 // If we found a new min
558 if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
559 {
560 // copy the new min across all lanes of our min accumulator
561 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
562 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
563
564 dotmin = min;
565
566 // find first occurrence of that min
567 size_t test;
568 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
569 {
570 }
571 // record where it is.
572 minIndex = 4 * index + segment + indexTable[test];
573 }
574 }
575
576 // account for work we've already done
577 count -= segment;
578
579 // Deal with the last < STACK_ARRAY_COUNT vectors
580 min = dotmin;
581 index = 0;
582
583 if (btUnlikely(count > 16))
584 {
585 for (; index + 4 <= count / 4; index += 4)
586 { // do four dot products at a time. Carefully avoid touching the w element.
587 float4 v0 = vertices[0];
588 float4 v1 = vertices[1];
589 float4 v2 = vertices[2];
590 float4 v3 = vertices[3];
591 vertices += 4;
592
593 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
594 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
595 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
596 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
597
598 lo0 = lo0 * vLo;
599 lo1 = lo1 * vLo;
600 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
601 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
602 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
603 z = z * vHi;
604 x = x + y;
605 x = x + z;
606 stack_array[index] = x;
607 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
608
609 v0 = vertices[0];
610 v1 = vertices[1];
611 v2 = vertices[2];
612 v3 = vertices[3];
613 vertices += 4;
614
615 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
616 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
617 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
618 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
619
620 lo0 = lo0 * vLo;
621 lo1 = lo1 * vLo;
622 z = _mm_shuffle_ps(hi0, hi1, 0x88);
623 x = _mm_shuffle_ps(lo0, lo1, 0x88);
624 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
625 z = z * vHi;
626 x = x + y;
627 x = x + z;
628 stack_array[index + 1] = x;
629 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
630
631 v0 = vertices[0];
632 v1 = vertices[1];
633 v2 = vertices[2];
634 v3 = vertices[3];
635 vertices += 4;
636
637 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
638 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
639 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
640 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
641
642 lo0 = lo0 * vLo;
643 lo1 = lo1 * vLo;
644 z = _mm_shuffle_ps(hi0, hi1, 0x88);
645 x = _mm_shuffle_ps(lo0, lo1, 0x88);
646 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
647 z = z * vHi;
648 x = x + y;
649 x = x + z;
650 stack_array[index + 2] = x;
651 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
652
653 v0 = vertices[0];
654 v1 = vertices[1];
655 v2 = vertices[2];
656 v3 = vertices[3];
657 vertices += 4;
658
659 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
660 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
661 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
662 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
663
664 lo0 = lo0 * vLo;
665 lo1 = lo1 * vLo;
666 z = _mm_shuffle_ps(hi0, hi1, 0x88);
667 x = _mm_shuffle_ps(lo0, lo1, 0x88);
668 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
669 z = z * vHi;
670 x = x + y;
671 x = x + z;
672 stack_array[index + 3] = x;
673 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
674
675 // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
676 }
677 }
678
679 size_t localCount = (count & -4L) - 4 * index;
680 if (localCount)
681 {
682#ifdef __APPLE__
683 vertices += localCount; // counter the offset
684 float4 t0, t1, t2, t3, t4;
685 size_t byteIndex = -(localCount) * sizeof(float);
686 float4 *sap = &stack_array[index + localCount / 4];
687
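// Same streaming layout as the assembly loop in _maxdot_large, with maxps
// replaced by minps to track the running minimum.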
688 asm volatile(
689 ".align 4 \n\
690 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
691 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
692 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
693 movaps %[t0], %[min] // vertices[0] \n\
694 movlhps %[t1], %[min] // x0y0x1y1 \n\
695 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
696 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
697 mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
698 movhlps %[t0], %[t1] // z0w0z1w1 \n\
699 movaps %[t3], %[t0] // vertices[2] \n\
700 movlhps %[t4], %[t0] // x2y2x3y3 \n\
701 movhlps %[t3], %[t4] // z2w2z3w3 \n\
702 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
703 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
704 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
705 movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
706 shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
707 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
708 addps %[t3], %[min] // x + y \n\
709 addps %[t1], %[min] // x + y + z \n\
710 movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
711 minps %[t2], %[min] // record min, restore min \n\
712 add $16, %[byteIndex] // advance loop counter\n\
713 jnz 0b \n\
714 "
715 : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
716 : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
717 : "memory", "cc");
718 index += localCount / 4;
719#else
720 {
721 for (unsigned int i = 0; i < localCount / 4; i++, index++)
722 { // do four dot products at a time. Carefully avoid touching the w element.
723 float4 v0 = vertices[0];
724 float4 v1 = vertices[1];
725 float4 v2 = vertices[2];
726 float4 v3 = vertices[3];
727 vertices += 4;
728
729 float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
730 float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
731 float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
732 float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
733
734 lo0 = lo0 * vLo;
735 lo1 = lo1 * vLo;
736 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
737 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
738 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
739 z = z * vHi;
740 x = x + y;
741 x = x + z;
742 stack_array[index] = x;
743 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
744 }
745 }
746
747#endif
748 }
749
750 // process the last few points
751 if (count & 3)
752 {
753 float4 v0, v1, v2, x, y, z;
754 switch (count & 3)
755 {
756 case 3:
757 {
758 v0 = vertices[0];
759 v1 = vertices[1];
760 v2 = vertices[2];
761
762 // Calculate 3 dot products, transpose, duplicate v2
763 float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
764 float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
765 lo0 = lo0 * vLo;
766 z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
767 z = z * vHi;
768 float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
769 lo1 = lo1 * vLo;
770 x = _mm_shuffle_ps(lo0, lo1, 0x88);
771 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
772 }
773 break;
774 case 2:
775 {
776 v0 = vertices[0];
777 v1 = vertices[1];
778 float4 xy = _mm_movelh_ps(v0, v1);
779 z = _mm_movehl_ps(v1, v0);
780 xy = xy * vLo;
781 z = _mm_shuffle_ps(z, z, 0xa8);
782 x = _mm_shuffle_ps(xy, xy, 0xa8);
783 y = _mm_shuffle_ps(xy, xy, 0xfd);
784 z = z * vHi;
785 }
786 break;
787 case 1:
788 {
789 float4 xy = vertices[0];
790 z = _mm_shuffle_ps(xy, xy, 0xaa);
791 xy = xy * vLo;
792 z = z * vHi;
793 x = _mm_shuffle_ps(xy, xy, 0);
794 y = _mm_shuffle_ps(xy, xy, 0x55);
795 }
796 break;
797 }
798 x = x + y;
799 x = x + z;
800 stack_array[index] = x;
801 min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
802 index++;
803 }
804
805 // if we found a new min.
806 if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
807 { // we found a new min. Search for it
808 // find min across the min vector, place in all elements of min -- big latency hit here
809 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
810 min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
811
812 // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
813 // this where it actually makes a difference is handled in the early out at the top of the function,
814 // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
815 // complexity, and removed it.
816
817 dotmin = min;
818
819 // scan for the first occurrence of min in the array
820 size_t test;
821 for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
822 {
823 }
824 minIndex = 4 * index + segment + indexTable[test];
825 }
826
827 _mm_store_ss(dotResult, dotmin);
828 return minIndex;
829}
830
831#elif defined BT_USE_NEON
832
833#define ARM_NEON_GCC_COMPATIBILITY 1
834#include <arm_neon.h>
835#include <sys/types.h>
836#include <sys/sysctl.h> //for sysctlbyname
837
838static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
839static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
840static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
841static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
842static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
843static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
844
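// _maxdot_large and _mindot_large start out pointing at the _sel dispatchers
// below. On first use, btGetCpuCapabilities() queries the
// hw.optional.neon_hpfp sysctl once and caches the answer; the dispatcher then
// rebinds the pointer to the _v1 kernels when capability bit 0x2000 is set (or
// to the _v0 kernels otherwise) and calls the chosen implementation, so the
// check happens only once per process.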
845long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
846long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
847
848static inline uint32_t btGetCpuCapabilities(void)
849{
850 static uint32_t capabilities = 0;
851 static bool testedCapabilities = false;
852
853 if (0 == testedCapabilities)
854 {
855 uint32_t hasFeature = 0;
856 size_t featureSize = sizeof(hasFeature);
857 int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);
858
859 if (0 == err && hasFeature)
860 capabilities |= 0x2000;
861
862 testedCapabilities = true;
863 }
864
865 return capabilities;
866}
867
868static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
869{
870 if (btGetCpuCapabilities() & 0x2000)
871 _maxdot_large = _maxdot_large_v1;
872 else
873 _maxdot_large = _maxdot_large_v0;
874
875 return _maxdot_large(vv, vec, count, dotResult);
876}
877
878static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
879{
880 if (btGetCpuCapabilities() & 0x2000)
881 _mindot_large = _mindot_large_v1;
882 else
883 _mindot_large = _mindot_large_v0;
884
885 return _mindot_large(vv, vec, count, dotResult);
886}
887
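// The vld1q_f32_aligned_postincrement helper defined below loads one
// 16-byte-aligned float32x4_t and advances the pointer past it: 32-bit ARM
// uses an inline vld1.f32 with post-increment, while the 64-bit fallback is a
// plain load plus a 16-byte pointer bump.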
888#if defined __arm__
889#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
890#else
891//support 64bit arm
892#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
893#endif
894
895long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
896{
897 unsigned long i = 0;
903 uint32x2_t indexLo = (uint32x2_t){0, 1};
904 uint32x2_t indexHi = (uint32x2_t){2, 3};
905 uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
906 uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
907 const uint32x2_t four = (uint32x2_t){4, 4};
908
909 for (; i + 8 <= count; i += 8)
910 {
915
920
923 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
924 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
925
928 rLo = vadd_f32(rLo, zLo);
929 rHi = vadd_f32(rHi, zHi);
930
939
944
949
952 zLo = vmul_f32(z0.val[0], vHi);
953 zHi = vmul_f32(z1.val[0], vHi);
954
955 rLo = vpadd_f32(xy0, xy1);
956 rHi = vpadd_f32(xy2, xy3);
957 rLo = vadd_f32(rLo, zLo);
958 rHi = vadd_f32(rHi, zHi);
959
968 }
969
970 for (; i + 4 <= count; i += 4)
971 {
976
981
984 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
985 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
986
989 rLo = vadd_f32(rLo, zLo);
990 rHi = vadd_f32(rHi, zHi);
991
1000 }
1001
1002 switch (count & 3)
1003 {
1004 case 3:
1005 {
1009
1013
1015 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1017
1020 rLo = vadd_f32(rLo, zLo);
1021 rHi = vadd_f32(rHi, zHi);
1022
1029 }
1030 break;
1031 case 2:
1032 {
1035
1038
1040 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1041
1043 rLo = vadd_f32(rLo, zLo);
1044
1048 }
1049 break;
1050 case 1:
1051 {
1057 rLo = vadd_f32(rLo, zLo);
1061 }
1062 break;
1063
1064 default:
1065 break;
1066 }
1067
1068 // select best answer between hi and lo results
1071 iLo = vbsl_u32(mask, iHi, iLo);
1072
1073 // select best answer between even and odd results
1075 iHi = vdup_lane_u32(iLo, 1);
1076 mask = vcgt_f32(dotMaxHi, dotMaxLo);
1078 iLo = vbsl_u32(mask, iHi, iLo);
1079
1081 return vget_lane_u32(iLo, 0);
1082}
1083
1084long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
1085{
1089 const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1090 uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1091 uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1093
1094 unsigned long i = 0;
1095 for (; i + 8 <= count; i += 8)
1096 {
1101
1102 // the next two lines should resolve to a single vswp d, d
1105 // the next two lines should resolve to a single vswp d, d
1108
1109 xy0 = vmulq_f32(xy0, vLo);
1110 xy1 = vmulq_f32(xy1, vLo);
1111
1113 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1115 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1116 x = vaddq_f32(x, z);
1117
1118 uint32x4_t mask = vcgtq_f32(x, maxDot);
1119 maxDot = vbslq_f32(mask, x, maxDot);
1120 index = vbslq_u32(mask, local_index, index);
1122
1127
1128 // the next two lines should resolve to a single vswp d, d
1131 // the next two lines should resolve to a single vswp d, d
1134
1135 xy0 = vmulq_f32(xy0, vLo);
1136 xy1 = vmulq_f32(xy1, vLo);
1137
1138 zb = vuzpq_f32(z0, z1);
1139 z = vmulq_f32(zb.val[0], vHi);
1140 xy = vuzpq_f32(xy0, xy1);
1141 x = vaddq_f32(xy.val[0], xy.val[1]);
1142 x = vaddq_f32(x, z);
1143
1144 mask = vcgtq_f32(x, maxDot);
1145 maxDot = vbslq_f32(mask, x, maxDot);
1146 index = vbslq_u32(mask, local_index, index);
1148 }
1149
1150 for (; i + 4 <= count; i += 4)
1151 {
1156
1157 // the next two lines should resolve to a single vswp d, d
1160 // the next two lines should resolve to a single vswp d, d
1163
1164 xy0 = vmulq_f32(xy0, vLo);
1165 xy1 = vmulq_f32(xy1, vLo);
1166
1168 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1170 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1171 x = vaddq_f32(x, z);
1172
1173 uint32x4_t mask = vcgtq_f32(x, maxDot);
1174 maxDot = vbslq_f32(mask, x, maxDot);
1175 index = vbslq_u32(mask, local_index, index);
1177 }
1178
1179 switch (count & 3)
1180 {
1181 case 3:
1182 {
1186
1187 // the next two lines should resolve to a single vswp d, d
1190 // the next two lines should resolve to a single vswp d, d
1193
1194 xy0 = vmulq_f32(xy0, vLo);
1195 xy1 = vmulq_f32(xy1, vLo);
1196
1198 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1200 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1201 x = vaddq_f32(x, z);
1202
1203 uint32x4_t mask = vcgtq_f32(x, maxDot);
1204 maxDot = vbslq_f32(mask, x, maxDot);
1205 index = vbslq_u32(mask, local_index, index);
1207 }
1208 break;
1209
1210 case 2:
1211 {
1214
1215 // the next two lines should resolve to a single vswp d, d
1217 // the next two lines should resolve to a single vswp d, d
1219
1220 xy0 = vmulq_f32(xy0, vLo);
1221
1223 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1225 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1226 x = vaddq_f32(x, z);
1227
1228 uint32x4_t mask = vcgtq_f32(x, maxDot);
1229 maxDot = vbslq_f32(mask, x, maxDot);
1230 index = vbslq_u32(mask, local_index, index);
1232 }
1233 break;
1234
1235 case 1:
1236 {
1238
1239 // the next two lines should resolve to a single vswp d, d
1241 // the next two lines should resolve to a single vswp d, d
1243
1244 xy0 = vmulq_f32(xy0, vLo);
1245
1246 z = vmulq_f32(z, vHi);
1248 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1249 x = vaddq_f32(x, z);
1250
1251 uint32x4_t mask = vcgtq_f32(x, maxDot);
1252 maxDot = vbslq_f32(mask, x, maxDot);
1253 index = vbslq_u32(mask, local_index, index);
1255 }
1256 break;
1257
1258 default:
1259 break;
1260 }
1261
1262 // select best answer between hi and lo results
1263 uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
1264 float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1265 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1266
1267 // select best answer between even and odd results
1270 mask = vcgt_f32(maxDotO, maxDot2);
1271 maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1272 index2 = vbsl_u32(mask, indexHi, index2);
1273
1275 return vget_lane_u32(index2, 0);
1276}
1277
1278long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
1279{
1280 unsigned long i = 0;
1286 uint32x2_t indexLo = (uint32x2_t){0, 1};
1287 uint32x2_t indexHi = (uint32x2_t){2, 3};
1288 uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1289 uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1290 const uint32x2_t four = (uint32x2_t){4, 4};
1291
1292 for (; i + 8 <= count; i += 8)
1293 {
1298
1303
1306 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1307 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1308
1311 rLo = vadd_f32(rLo, zLo);
1312 rHi = vadd_f32(rHi, zHi);
1313
1322
1327
1328 xy0 = vmul_f32(vget_low_f32(v0), vLo);
1329 xy1 = vmul_f32(vget_low_f32(v1), vLo);
1330 xy2 = vmul_f32(vget_low_f32(v2), vLo);
1332
1335 zLo = vmul_f32(z0.val[0], vHi);
1336 zHi = vmul_f32(z1.val[0], vHi);
1337
1338 rLo = vpadd_f32(xy0, xy1);
1339 rHi = vpadd_f32(xy2, xy3);
1340 rLo = vadd_f32(rLo, zLo);
1341 rHi = vadd_f32(rHi, zHi);
1342
1351 }
1352
1353 for (; i + 4 <= count; i += 4)
1354 {
1359
1364
1367 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1368 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1369
1372 rLo = vadd_f32(rLo, zLo);
1373 rHi = vadd_f32(rHi, zHi);
1374
1383 }
1384 switch (count & 3)
1385 {
1386 case 3:
1387 {
1391
1395
1397 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1399
1402 rLo = vadd_f32(rLo, zLo);
1403 rHi = vadd_f32(rHi, zHi);
1404
1411 }
1412 break;
1413 case 2:
1414 {
1417
1420
1422 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1423
1425 rLo = vadd_f32(rLo, zLo);
1426
1430 }
1431 break;
1432 case 1:
1433 {
1439 rLo = vadd_f32(rLo, zLo);
1443 }
1444 break;
1445
1446 default:
1447 break;
1448 }
1449
1450 // select best answer between hi and lo results
1453 iLo = vbsl_u32(mask, iHi, iLo);
1454
1455 // select best answer between even and odd results
1457 iHi = vdup_lane_u32(iLo, 1);
1458 mask = vclt_f32(dotMinHi, dotMinLo);
1460 iLo = vbsl_u32(mask, iHi, iLo);
1461
1463 return vget_lane_u32(iLo, 0);
1464}
1465
1466long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
1467{
1471 const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1472 uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1473 uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1475
1476 unsigned long i = 0;
1477 for (; i + 8 <= count; i += 8)
1478 {
1483
1484 // the next two lines should resolve to a single vswp d, d
1487 // the next two lines should resolve to a single vswp d, d
1490
1491 xy0 = vmulq_f32(xy0, vLo);
1492 xy1 = vmulq_f32(xy1, vLo);
1493
1495 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1497 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1498 x = vaddq_f32(x, z);
1499
1500 uint32x4_t mask = vcltq_f32(x, minDot);
1501 minDot = vbslq_f32(mask, x, minDot);
1502 index = vbslq_u32(mask, local_index, index);
1504
1509
1510 // the next two lines should resolve to a single vswp d, d
1513 // the next two lines should resolve to a single vswp d, d
1516
1517 xy0 = vmulq_f32(xy0, vLo);
1518 xy1 = vmulq_f32(xy1, vLo);
1519
1520 zb = vuzpq_f32(z0, z1);
1521 z = vmulq_f32(zb.val[0], vHi);
1522 xy = vuzpq_f32(xy0, xy1);
1523 x = vaddq_f32(xy.val[0], xy.val[1]);
1524 x = vaddq_f32(x, z);
1525
1526 mask = vcltq_f32(x, minDot);
1527 minDot = vbslq_f32(mask, x, minDot);
1528 index = vbslq_u32(mask, local_index, index);
1530 }
1531
1532 for (; i + 4 <= count; i += 4)
1533 {
1538
1539 // the next two lines should resolve to a single vswp d, d
1542 // the next two lines should resolve to a single vswp d, d
1545
1546 xy0 = vmulq_f32(xy0, vLo);
1547 xy1 = vmulq_f32(xy1, vLo);
1548
1550 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1552 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1553 x = vaddq_f32(x, z);
1554
1555 uint32x4_t mask = vcltq_f32(x, minDot);
1556 minDot = vbslq_f32(mask, x, minDot);
1557 index = vbslq_u32(mask, local_index, index);
1559 }
1560
1561 switch (count & 3)
1562 {
1563 case 3:
1564 {
1568
1569 // the next two lines should resolve to a single vswp d, d
1572 // the next two lines should resolve to a single vswp d, d
1575
1576 xy0 = vmulq_f32(xy0, vLo);
1577 xy1 = vmulq_f32(xy1, vLo);
1578
1580 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1582 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1583 x = vaddq_f32(x, z);
1584
1585 uint32x4_t mask = vcltq_f32(x, minDot);
1586 minDot = vbslq_f32(mask, x, minDot);
1587 index = vbslq_u32(mask, local_index, index);
1589 }
1590 break;
1591
1592 case 2:
1593 {
1596
1597 // the next two lines should resolve to a single vswp d, d
1599 // the next two lines should resolve to a single vswp d, d
1601
1602 xy0 = vmulq_f32(xy0, vLo);
1603
1605 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1607 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1608 x = vaddq_f32(x, z);
1609
1610 uint32x4_t mask = vcltq_f32(x, minDot);
1611 minDot = vbslq_f32(mask, x, minDot);
1612 index = vbslq_u32(mask, local_index, index);
1614 }
1615 break;
1616
1617 case 1:
1618 {
1620
1621 // the next two lines should resolve to a single vswp d, d
1623 // the next two lines should resolve to a single vswp d, d
1625
1626 xy0 = vmulq_f32(xy0, vLo);
1627
1628 z = vmulq_f32(z, vHi);
1630 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1631 x = vaddq_f32(x, z);
1632
1633 uint32x4_t mask = vcltq_f32(x, minDot);
1634 minDot = vbslq_f32(mask, x, minDot);
1635 index = vbslq_u32(mask, local_index, index);
1637 }
1638 break;
1639
1640 default:
1641 break;
1642 }
1643
1644 // select best answer between hi and lo results
1645 uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
1646 float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1647 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1648
1649 // select best answer between even and odd results
1652 mask = vclt_f32(minDotO, minDot2);
1653 minDot2 = vbsl_f32(mask, minDotO, minDot2);
1654 index2 = vbsl_u32(mask, indexHi, index2);
1655
1657 return vget_lane_u32(index2, 0);
1658}
1659
1660#else
1661#error Unhandled __APPLE__ arch
1662#endif
1663
1664#endif /* __APPLE__ */