#if defined(_WIN32) || defined(__i386__)
#define BT_USE_SSE_IN_API
#endif

#if defined BT_USE_SIMD_VECTOR3

typedef float float4 __attribute__((vector_size(16)));

#if defined BT_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
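/*
   For clarity, a minimal scalar sketch of what _maxdot_large is assumed to compute: each
   input "vertex" is a 4-float record (x, y, z, w); the routine returns the index of the
   vertex whose dot product with vec (x, y, z only) is largest and writes that dot product
   to *dotResult. The helper name maxdot_reference below is hypothetical and not part of
   the build.

   static long maxdot_reference(const float *vv, const float *vec,
                                unsigned long count, float *dotResult)
   {
       long bestIndex = -1;
       float best = -FLT_MAX;                       // requires <float.h>
       for (unsigned long i = 0; i < count; i++)
       {
           const float *v = vv + 4 * i;             // vertices are stored 4 floats apart
           float dot = v[0] * vec[0] + v[1] * vec[1] + v[2] * vec[2];  // w is ignored
           if (dot > best)
           {
               best = dot;
               bestIndex = (long)i;
           }
       }
       *dotResult = best;
       return bestIndex;
   }
*/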
long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	const float4 *vertices = (const float4 *)vv;
	static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};

	float4 vvec = _mm_loadu_ps(vec);
	float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  // zzzz
	float4 vLo = _mm_movelh_ps(vvec, vvec);                                    // xyxy

	float4 stack_array[STACK_ARRAY_COUNT];
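	// Process the vertex list in tiles of STACK_ARRAY_COUNT * 4 vertices. Each 4-wide dot
	// product result is parked in stack_array so that, when a tile produces a new running
	// maximum, the winning lane (and hence the vertex index) can be recovered afterwards
	// instead of being tracked on every iteration.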
67 for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
71 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
73 float4 v0 = vertices[0];
74 float4 v1 = vertices[1];
75 float4 v2 = vertices[2];
76 float4 v3 = vertices[3];
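			// Transpose the four xyzw vertices into lane vectors: movelh/movehl gather the
			// xy and z(w) halves, and the shuffles below yield x0x1x2x3, y0y1y2y3, z0z1z2z3.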
79 float4 lo0 = _mm_movelh_ps(v0, v1);
80 float4 hi0 = _mm_movehl_ps(v1, v0);
81 float4 lo1 = _mm_movelh_ps(v2, v3);
82 float4 hi1 = _mm_movehl_ps(v3, v2);
86 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
87 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
88 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
92 stack_array[index] = x;
93 max = _mm_max_ps(x, max);
101 lo0 = _mm_movelh_ps(v0, v1);
102 hi0 = _mm_movehl_ps(v1, v0);
103 lo1 = _mm_movelh_ps(v2, v3);
104 hi1 = _mm_movehl_ps(v3, v2);
108 z = _mm_shuffle_ps(hi0, hi1, 0x88);
109 x = _mm_shuffle_ps(lo0, lo1, 0x88);
110 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
114 stack_array[index + 1] = x;
115 max = _mm_max_ps(x, max);
123 lo0 = _mm_movelh_ps(v0, v1);
124 hi0 = _mm_movehl_ps(v1, v0);
125 lo1 = _mm_movelh_ps(v2, v3);
126 hi1 = _mm_movehl_ps(v3, v2);
130 z = _mm_shuffle_ps(hi0, hi1, 0x88);
131 x = _mm_shuffle_ps(lo0, lo1, 0x88);
132 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
136 stack_array[index + 2] = x;
137 max = _mm_max_ps(x, max);
145 lo0 = _mm_movelh_ps(v0, v1);
146 hi0 = _mm_movehl_ps(v1, v0);
147 lo1 = _mm_movelh_ps(v2, v3);
148 hi1 = _mm_movehl_ps(v3, v2);
152 z = _mm_shuffle_ps(hi0, hi1, 0x88);
153 x = _mm_shuffle_ps(lo0, lo1, 0x88);
154 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
158 stack_array[index + 3] = x;
159 max = _mm_max_ps(x, max);
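		// The running max is only compared against the best-so-far once per tile; when it
		// changed, _mm_cmpeq_ps + _mm_movemask_ps locate the matching lane in stack_array,
		// and indexTable maps the 4-bit mask to the lowest set lane (e.g. mask 0b0100 -> lane 2).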
		if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
		{
			max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
			max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
			dotMax = max;
			size_t test;
			for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) { }
			maxIndex = 4 * index + segment + indexTable[test];
		}
192 for (; index + 4 <= count / 4; index += 4)
194 float4 v0 = vertices[0];
195 float4 v1 = vertices[1];
196 float4 v2 = vertices[2];
197 float4 v3 = vertices[3];
200 float4 lo0 = _mm_movelh_ps(v0, v1);
201 float4 hi0 = _mm_movehl_ps(v1, v0);
202 float4 lo1 = _mm_movelh_ps(v2, v3);
203 float4 hi1 = _mm_movehl_ps(v3, v2);
207 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
213 stack_array[index] = x;
214 max = _mm_max_ps(x, max);
222 lo0 = _mm_movelh_ps(v0, v1);
223 hi0 = _mm_movehl_ps(v1, v0);
224 lo1 = _mm_movelh_ps(v2, v3);
225 hi1 = _mm_movehl_ps(v3, v2);
229 z = _mm_shuffle_ps(hi0, hi1, 0x88);
230 x = _mm_shuffle_ps(lo0, lo1, 0x88);
231 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
235 stack_array[index + 1] = x;
236 max = _mm_max_ps(x, max);
244 lo0 = _mm_movelh_ps(v0, v1);
245 hi0 = _mm_movehl_ps(v1, v0);
246 lo1 = _mm_movelh_ps(v2, v3);
247 hi1 = _mm_movehl_ps(v3, v2);
251 z = _mm_shuffle_ps(hi0, hi1, 0x88);
252 x = _mm_shuffle_ps(lo0, lo1, 0x88);
253 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
257 stack_array[index + 2] = x;
258 max = _mm_max_ps(x, max);
266 lo0 = _mm_movelh_ps(v0, v1);
267 hi0 = _mm_movehl_ps(v1, v0);
268 lo1 = _mm_movelh_ps(v2, v3);
269 hi1 = _mm_movehl_ps(v3, v2);
273 z = _mm_shuffle_ps(hi0, hi1, 0x88);
274 x = _mm_shuffle_ps(lo0, lo1, 0x88);
275 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
279 stack_array[index + 3] = x;
280 max = _mm_max_ps(x, max);
	size_t localCount = (count & -4L) - 4 * index;
	float4 t0, t1, t2, t3, t4;
	float4 *sap = &stack_array[index + localCount / 4];
	vertices += localCount;
	size_t byteIndex = -(localCount) * sizeof(float);
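	// Inline SSE assembly inner loop: byteIndex starts negative and counts up to zero, so the
	// same register both addresses the remaining vertices and terminates the loop. Each pass
	// computes four dot products, stores them through sap for later index recovery, and folds
	// them into the running max.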
	asm volatile(
		".align 4                                                                   \n\
         0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
         movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
         movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
         movaps  %[t0], %[max]                               // vertices[0]      \n\
         movlhps %[t1], %[max]                               // x0y0x1y1         \n\
         movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
         movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
         mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
         movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
         movaps  %[t3], %[t0]                                // vertices[2]      \n\
         movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
         mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
         movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
         shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
         mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
         movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
         shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
         shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
         addps   %[t3], %[max]                               // x + y            \n\
         addps   %[t1], %[max]                               // x + y + z        \n\
         movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
         maxps   %[t2], %[max]                               // record max, restore max \n\
         add     $16, %[byteIndex]                           // advance loop counter \n\
         jnz     0b                                          \n\
         "
		: [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
		: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
		: "memory", "cc");
	index += localCount / 4;

	for (unsigned int i = 0; i < localCount / 4; i++, index++)
	{
330 float4 v0 = vertices[0];
331 float4 v1 = vertices[1];
332 float4 v2 = vertices[2];
333 float4 v3 = vertices[3];
336 float4 lo0 = _mm_movelh_ps(v0, v1);
337 float4 hi0 = _mm_movehl_ps(v1, v0);
338 float4 lo1 = _mm_movelh_ps(v2, v3);
339 float4 hi1 = _mm_movehl_ps(v3, v2);
343 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
344 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
345 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
349 stack_array[index] = x;
350 max = _mm_max_ps(x, max);
359 float4 v0, v1, v2, x, y, z;
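	// Tail: handle the final (count & 3) vertices; the 3-, 2- and 1-vertex paths below each
	// build a single x/y/z lane set and fold it into the running max.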
369 float4 lo0 = _mm_movelh_ps(v0, v1);
370 float4 hi0 = _mm_movehl_ps(v1, v0);
372 z = _mm_shuffle_ps(hi0, v2, 0xa8);
374 float4 lo1 = _mm_movelh_ps(v2, v2);
376 x = _mm_shuffle_ps(lo0, lo1, 0x88);
377 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
384 float4 xy = _mm_movelh_ps(v0, v1);
385 z = _mm_movehl_ps(v1, v0);
387 z = _mm_shuffle_ps(z, z, 0xa8);
388 x = _mm_shuffle_ps(xy, xy, 0xa8);
389 y = _mm_shuffle_ps(xy, xy, 0xfd);
395 float4 xy = vertices[0];
396 z = _mm_shuffle_ps(xy, xy, 0xaa);
399 x = _mm_shuffle_ps(xy, xy, 0);
400 y = _mm_shuffle_ps(xy, xy, 0x55);
406 stack_array[index] = x;
407 max = _mm_max_ps(x, max);
	if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
	{
		max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
		max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
		dotMax = max;
		size_t test;
		for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) { }
		maxIndex = 4 * index + segment + indexTable[test];
	}
	_mm_store_ss(dotResult, dotMax);
	return maxIndex;
}
long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	const float4 *vertices = (const float4 *)vv;
	static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};

	float4 vvec = _mm_loadu_ps(vec);
	float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  // zzzz
	float4 vLo = _mm_movelh_ps(vvec, vvec);                                    // xyxy

	float4 stack_array[STACK_ARRAY_COUNT];
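	// _mindot_large mirrors _maxdot_large: the same tiling and transposition, with
	// _mm_min_ps in place of _mm_max_ps for the running extremum.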
460 for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
464 for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
466 float4 v0 = vertices[0];
467 float4 v1 = vertices[1];
468 float4 v2 = vertices[2];
469 float4 v3 = vertices[3];
472 float4 lo0 = _mm_movelh_ps(v0, v1);
473 float4 hi0 = _mm_movehl_ps(v1, v0);
474 float4 lo1 = _mm_movelh_ps(v2, v3);
475 float4 hi1 = _mm_movehl_ps(v3, v2);
479 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
480 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
481 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
485 stack_array[index] = x;
486 min = _mm_min_ps(x, min);
494 lo0 = _mm_movelh_ps(v0, v1);
495 hi0 = _mm_movehl_ps(v1, v0);
496 lo1 = _mm_movelh_ps(v2, v3);
497 hi1 = _mm_movehl_ps(v3, v2);
501 z = _mm_shuffle_ps(hi0, hi1, 0x88);
502 x = _mm_shuffle_ps(lo0, lo1, 0x88);
503 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
507 stack_array[index + 1] = x;
508 min = _mm_min_ps(x, min);
516 lo0 = _mm_movelh_ps(v0, v1);
517 hi0 = _mm_movehl_ps(v1, v0);
518 lo1 = _mm_movelh_ps(v2, v3);
519 hi1 = _mm_movehl_ps(v3, v2);
523 z = _mm_shuffle_ps(hi0, hi1, 0x88);
524 x = _mm_shuffle_ps(lo0, lo1, 0x88);
525 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
529 stack_array[index + 2] = x;
530 min = _mm_min_ps(x, min);
538 lo0 = _mm_movelh_ps(v0, v1);
539 hi0 = _mm_movehl_ps(v1, v0);
540 lo1 = _mm_movelh_ps(v2, v3);
541 hi1 = _mm_movehl_ps(v3, v2);
545 z = _mm_shuffle_ps(hi0, hi1, 0x88);
546 x = _mm_shuffle_ps(lo0, lo1, 0x88);
547 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
551 stack_array[index + 3] = x;
552 min = _mm_min_ps(x, min);
		if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
		{
			min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
			min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
			dotmin = min;
			size_t test;
			for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) { }
			minIndex = 4 * index + segment + indexTable[test];
		}
585 for (; index + 4 <= count / 4; index += 4)
587 float4 v0 = vertices[0];
588 float4 v1 = vertices[1];
589 float4 v2 = vertices[2];
590 float4 v3 = vertices[3];
593 float4 lo0 = _mm_movelh_ps(v0, v1);
594 float4 hi0 = _mm_movehl_ps(v1, v0);
595 float4 lo1 = _mm_movelh_ps(v2, v3);
596 float4 hi1 = _mm_movehl_ps(v3, v2);
600 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
601 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
602 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
606 stack_array[index] = x;
607 min = _mm_min_ps(x, min);
615 lo0 = _mm_movelh_ps(v0, v1);
616 hi0 = _mm_movehl_ps(v1, v0);
617 lo1 = _mm_movelh_ps(v2, v3);
618 hi1 = _mm_movehl_ps(v3, v2);
622 z = _mm_shuffle_ps(hi0, hi1, 0x88);
623 x = _mm_shuffle_ps(lo0, lo1, 0x88);
624 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
628 stack_array[index + 1] = x;
629 min = _mm_min_ps(x, min);
637 lo0 = _mm_movelh_ps(v0, v1);
638 hi0 = _mm_movehl_ps(v1, v0);
639 lo1 = _mm_movelh_ps(v2, v3);
640 hi1 = _mm_movehl_ps(v3, v2);
644 z = _mm_shuffle_ps(hi0, hi1, 0x88);
645 x = _mm_shuffle_ps(lo0, lo1, 0x88);
646 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
650 stack_array[index + 2] = x;
651 min = _mm_min_ps(x, min);
659 lo0 = _mm_movelh_ps(v0, v1);
660 hi0 = _mm_movehl_ps(v1, v0);
661 lo1 = _mm_movelh_ps(v2, v3);
662 hi1 = _mm_movehl_ps(v3, v2);
666 z = _mm_shuffle_ps(hi0, hi1, 0x88);
667 x = _mm_shuffle_ps(lo0, lo1, 0x88);
668 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
672 stack_array[index + 3] = x;
673 min = _mm_min_ps(x, min);
	size_t localCount = (count & -4L) - 4 * index;
	vertices += localCount;
	float4 t0, t1, t2, t3, t4;
	size_t byteIndex = -(localCount) * sizeof(float);
	float4 *sap = &stack_array[index + localCount / 4];
	asm volatile(
		".align 4                                                                   \n\
         0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
         movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
         movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
         movaps  %[t0], %[min]                               // vertices[0]      \n\
         movlhps %[t1], %[min]                               // x0y0x1y1         \n\
         movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
         movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
         mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
         movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
         movaps  %[t3], %[t0]                                // vertices[2]      \n\
         movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
         movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
         mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
         shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
         mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
         movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
         shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
         shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
         addps   %[t3], %[min]                               // x + y            \n\
         addps   %[t1], %[min]                               // x + y + z        \n\
         movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
         minps   %[t2], %[min]                               // record min, restore min \n\
         add     $16, %[byteIndex]                           // advance loop counter \n\
         jnz     0b                                          \n\
         "
		: [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
		: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
		: "memory", "cc");
	index += localCount / 4;

	for (unsigned int i = 0; i < localCount / 4; i++, index++)
	{
723 float4 v0 = vertices[0];
724 float4 v1 = vertices[1];
725 float4 v2 = vertices[2];
726 float4 v3 = vertices[3];
729 float4 lo0 = _mm_movelh_ps(v0, v1);
730 float4 hi0 = _mm_movehl_ps(v1, v0);
731 float4 lo1 = _mm_movelh_ps(v2, v3);
732 float4 hi1 = _mm_movehl_ps(v3, v2);
736 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
737 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
738 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
742 stack_array[index] = x;
743 min = _mm_min_ps(x, min);
753 float4 v0, v1, v2, x, y, z;
763 float4 lo0 = _mm_movelh_ps(v0, v1);
764 float4 hi0 = _mm_movehl_ps(v1, v0);
766 z = _mm_shuffle_ps(hi0, v2, 0xa8);
768 float4 lo1 = _mm_movelh_ps(v2, v2);
770 x = _mm_shuffle_ps(lo0, lo1, 0x88);
771 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
778 float4 xy = _mm_movelh_ps(v0, v1);
779 z = _mm_movehl_ps(v1, v0);
781 z = _mm_shuffle_ps(z, z, 0xa8);
782 x = _mm_shuffle_ps(xy, xy, 0xa8);
783 y = _mm_shuffle_ps(xy, xy, 0xfd);
789 float4 xy = vertices[0];
790 z = _mm_shuffle_ps(xy, xy, 0xaa);
793 x = _mm_shuffle_ps(xy, xy, 0);
794 y = _mm_shuffle_ps(xy, xy, 0x55);
800 stack_array[index] = x;
801 min = _mm_min_ps(x, min);
	if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
	{
		min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
		min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
		dotmin = min;
		size_t test;
		for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) { }
		minIndex = 4 * index + segment + indexTable[test];
	}
	_mm_store_ss(dotResult, dotmin);
	return minIndex;
}
#elif defined BT_USE_NEON

#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
#include <sys/types.h>
#include <sys/sysctl.h>  //for sysctlbyname
static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);

long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
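// _maxdot_large and _mindot_large are function pointers: they start out aimed at the _sel
// selector stubs, which probe the CPU once and then rebind the pointer to the _v0 or _v1
// implementation, so later calls dispatch directly to the chosen variant.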
static inline uint32_t btGetCpuCapabilities(void)
{
	static uint32_t capabilities = 0;
	static bool testedCapabilities = false;

	if (0 == testedCapabilities)
	{
		uint32_t hasFeature = 0;
		size_t featureSize = sizeof(hasFeature);
		// Probe the hw.optional.neon_hpfp sysctl once and cache the answer.
		int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);

		if (0 == err && hasFeature)
			capabilities |= 0x2000;

		testedCapabilities = true;
	}

	return capabilities;
}
static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	if (btGetCpuCapabilities() & 0x2000)
		_maxdot_large = _maxdot_large_v1;
	else
		_maxdot_large = _maxdot_large_v0;

	return _maxdot_large(vv, vec, count, dotResult);
}
static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	if (btGetCpuCapabilities() & 0x2000)
		_mindot_large = _mindot_large_v1;
	else
		_mindot_large = _mindot_large_v0;

	return _mindot_large(vv, vec, count, dotResult);
}
#if defined(__arm__)
#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); _r; })
#else
#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); _r; })
#endif
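// vld1q_f32_aligned_postincrement(p) loads four floats from a 16-byte aligned pointer and
// advances the pointer past them, either with a vld1.f32 post-increment instruction or with
// a plain aligned load plus a pointer bump.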
long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	unsigned long i = 0;
	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
	float32x2_t vLo = vget_low_f32(vvec);
	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);

	uint32x2_t indexLo = (uint32x2_t){0, 1};
	uint32x2_t indexHi = (uint32x2_t){2, 3};
	uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	const uint32x2_t four = (uint32x2_t){4, 4};
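	// v0 strategy: dot products are formed two at a time in 64-bit NEON registers. rLo/rHi cover
	// the first and second pair of each group of four vertices; dotMaxLo/dotMaxHi track the
	// running maxima of those pairs and iLo/iHi the corresponding vertex indices, with
	// indexLo/indexHi stepped by four per group.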
909 for (; i + 8 <= count; i += 8)
911 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
912 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
913 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
914 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
916 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
917 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
918 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
919 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
921 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
922 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
923 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
924 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
926 float32x2_t rLo = vpadd_f32(xy0, xy1);
927 float32x2_t rHi = vpadd_f32(xy2, xy3);
928 rLo = vadd_f32(rLo, zLo);
929 rHi = vadd_f32(rHi, zHi);
931 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
932 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
933 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
934 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
935 iLo = vbsl_u32(maskLo, indexLo, iLo);
936 iHi = vbsl_u32(maskHi, indexHi, iHi);
937 indexLo = vadd_u32(indexLo, four);
938 indexHi = vadd_u32(indexHi, four);
940 v0 = vld1q_f32_aligned_postincrement(vv);
941 v1 = vld1q_f32_aligned_postincrement(vv);
942 v2 = vld1q_f32_aligned_postincrement(vv);
943 v3 = vld1q_f32_aligned_postincrement(vv);
945 xy0 = vmul_f32(vget_low_f32(v0), vLo);
946 xy1 = vmul_f32(vget_low_f32(v1), vLo);
947 xy2 = vmul_f32(vget_low_f32(v2), vLo);
948 xy3 = vmul_f32(vget_low_f32(v3), vLo);
950 z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
951 z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
952 zLo = vmul_f32(z0.val[0], vHi);
953 zHi = vmul_f32(z1.val[0], vHi);
955 rLo = vpadd_f32(xy0, xy1);
956 rHi = vpadd_f32(xy2, xy3);
957 rLo = vadd_f32(rLo, zLo);
958 rHi = vadd_f32(rHi, zHi);
960 maskLo = vcgt_f32(rLo, dotMaxLo);
961 maskHi = vcgt_f32(rHi, dotMaxHi);
962 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
963 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
964 iLo = vbsl_u32(maskLo, indexLo, iLo);
965 iHi = vbsl_u32(maskHi, indexHi, iHi);
966 indexLo = vadd_u32(indexLo, four);
967 indexHi = vadd_u32(indexHi, four);
970 for (; i + 4 <= count; i += 4)
972 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
973 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
974 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
975 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
977 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
978 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
979 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
980 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
982 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
983 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
984 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
985 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
987 float32x2_t rLo = vpadd_f32(xy0, xy1);
988 float32x2_t rHi = vpadd_f32(xy2, xy3);
989 rLo = vadd_f32(rLo, zLo);
990 rHi = vadd_f32(rHi, zHi);
992 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
993 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
994 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
995 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
996 iLo = vbsl_u32(maskLo, indexLo, iLo);
997 iHi = vbsl_u32(maskHi, indexHi, iHi);
998 indexLo = vadd_u32(indexLo, four);
999 indexHi = vadd_u32(indexHi, four);
1006 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1007 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1008 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1010 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1011 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1012 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1014 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1015 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1016 float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1018 float32x2_t rLo = vpadd_f32(xy0, xy1);
1019 float32x2_t rHi = vpadd_f32(xy2, xy2);
1020 rLo = vadd_f32(rLo, zLo);
1021 rHi = vadd_f32(rHi, zHi);
1023 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1024 uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
1025 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1026 dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
1027 iLo = vbsl_u32(maskLo, indexLo, iLo);
1028 iHi = vbsl_u32(maskHi, indexHi, iHi);
1033 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1034 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1036 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1037 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1039 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1040 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1042 float32x2_t rLo = vpadd_f32(xy0, xy1);
1043 rLo = vadd_f32(rLo, zLo);
1045 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1046 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1047 iLo = vbsl_u32(maskLo, indexLo, iLo);
1052 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1053 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1054 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1055 float32x2_t zLo = vmul_f32(z0, vHi);
1056 float32x2_t rLo = vpadd_f32(xy0, xy0);
1057 rLo = vadd_f32(rLo, zLo);
1058 uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1059 dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1060 iLo = vbsl_u32(maskLo, indexLo, iLo);
1069 uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
1070 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1071 iLo = vbsl_u32(mask, iHi, iLo);
1074 dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1075 iHi = vdup_lane_u32(iLo, 1);
1076 mask = vcgt_f32(dotMaxHi, dotMaxLo);
1077 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1078 iLo = vbsl_u32(mask, iHi, iLo);
1080 *dotResult = vget_lane_f32(dotMaxLo, 0);
1081 return vget_lane_u32(iLo, 0);
long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
	const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
	uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
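	// v1 strategy: process four full dot products per quadword iteration. vuzpq separates the
	// interleaved x/y and z lanes, maxDot keeps the per-lane running maxima, index records the
	// vertex index behind each lane's maximum, and local_index advances by four per group.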
1094 unsigned long i = 0;
1095 for (; i + 8 <= count; i += 8)
1097 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1098 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1099 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1100 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1103 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1104 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1106 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1107 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1109 xy0 = vmulq_f32(xy0, vLo);
1110 xy1 = vmulq_f32(xy1, vLo);
1112 float32x4x2_t zb = vuzpq_f32(z0, z1);
1113 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1114 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1115 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1116 x = vaddq_f32(x, z);
1118 uint32x4_t mask = vcgtq_f32(x, maxDot);
1119 maxDot = vbslq_f32(mask, x, maxDot);
1120 index = vbslq_u32(mask, local_index, index);
1121 local_index = vaddq_u32(local_index, four);
1123 v0 = vld1q_f32_aligned_postincrement(vv);
1124 v1 = vld1q_f32_aligned_postincrement(vv);
1125 v2 = vld1q_f32_aligned_postincrement(vv);
1126 v3 = vld1q_f32_aligned_postincrement(vv);
1129 xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1130 xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1132 z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1133 z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1135 xy0 = vmulq_f32(xy0, vLo);
1136 xy1 = vmulq_f32(xy1, vLo);
1138 zb = vuzpq_f32(z0, z1);
1139 z = vmulq_f32(zb.val[0], vHi);
1140 xy = vuzpq_f32(xy0, xy1);
1141 x = vaddq_f32(xy.val[0], xy.val[1]);
1142 x = vaddq_f32(x, z);
1144 mask = vcgtq_f32(x, maxDot);
1145 maxDot = vbslq_f32(mask, x, maxDot);
1146 index = vbslq_u32(mask, local_index, index);
1147 local_index = vaddq_u32(local_index, four);
1150 for (; i + 4 <= count; i += 4)
1152 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1153 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1154 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1155 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1158 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1159 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1161 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1162 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1164 xy0 = vmulq_f32(xy0, vLo);
1165 xy1 = vmulq_f32(xy1, vLo);
1167 float32x4x2_t zb = vuzpq_f32(z0, z1);
1168 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1169 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1170 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1171 x = vaddq_f32(x, z);
1173 uint32x4_t mask = vcgtq_f32(x, maxDot);
1174 maxDot = vbslq_f32(mask, x, maxDot);
1175 index = vbslq_u32(mask, local_index, index);
1176 local_index = vaddq_u32(local_index, four);
1183 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1184 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1185 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1188 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1189 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1191 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1192 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1194 xy0 = vmulq_f32(xy0, vLo);
1195 xy1 = vmulq_f32(xy1, vLo);
1197 float32x4x2_t zb = vuzpq_f32(z0, z1);
1198 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1199 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1200 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1201 x = vaddq_f32(x, z);
1203 uint32x4_t mask = vcgtq_f32(x, maxDot);
1204 maxDot = vbslq_f32(mask, x, maxDot);
1205 index = vbslq_u32(mask, local_index, index);
1206 local_index = vaddq_u32(local_index, four);
1212 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1213 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1216 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1218 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1220 xy0 = vmulq_f32(xy0, vLo);
1222 float32x4x2_t zb = vuzpq_f32(z0, z0);
1223 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1224 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1225 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1226 x = vaddq_f32(x, z);
1228 uint32x4_t mask = vcgtq_f32(x, maxDot);
1229 maxDot = vbslq_f32(mask, x, maxDot);
1230 index = vbslq_u32(mask, local_index, index);
1231 local_index = vaddq_u32(local_index, four);
1237 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1240 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
1242 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1244 xy0 = vmulq_f32(xy0, vLo);
1246 z = vmulq_f32(z, vHi);
1247 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1248 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1249 x = vaddq_f32(x, z);
1251 uint32x4_t mask = vcgtq_f32(x, maxDot);
1252 maxDot = vbslq_f32(mask, x, maxDot);
1253 index = vbslq_u32(mask, local_index, index);
1254 local_index = vaddq_u32(local_index, four);
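	// Reduce the four lanes to a single winner: compare the high and low halves, then lane 1
	// against lane 0, carrying the index along with the value at each step.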
1263 uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
1264 float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1265 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1268 float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1269 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1270 mask = vcgt_f32(maxDotO, maxDot2);
1271 maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1272 index2 = vbsl_u32(mask, indexHi, index2);
1274 *dotResult = vget_lane_f32(maxDot2, 0);
1275 return vget_lane_u32(index2, 0);
long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	unsigned long i = 0;
	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
	float32x2_t vLo = vget_low_f32(vvec);
	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);

	uint32x2_t indexLo = (uint32x2_t){0, 1};
	uint32x2_t indexHi = (uint32x2_t){2, 3};
	uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
	const uint32x2_t four = (uint32x2_t){4, 4};
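	// Mirror of _maxdot_large_v0, tracking minima with vclt_f32 instead of vcgt_f32.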
1292 for (; i + 8 <= count; i += 8)
1294 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1295 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1296 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1297 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1299 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1300 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1301 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1302 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1304 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1305 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1306 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1307 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1309 float32x2_t rLo = vpadd_f32(xy0, xy1);
1310 float32x2_t rHi = vpadd_f32(xy2, xy3);
1311 rLo = vadd_f32(rLo, zLo);
1312 rHi = vadd_f32(rHi, zHi);
1314 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1315 uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1316 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1317 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1318 iLo = vbsl_u32(maskLo, indexLo, iLo);
1319 iHi = vbsl_u32(maskHi, indexHi, iHi);
1320 indexLo = vadd_u32(indexLo, four);
1321 indexHi = vadd_u32(indexHi, four);
1323 v0 = vld1q_f32_aligned_postincrement(vv);
1324 v1 = vld1q_f32_aligned_postincrement(vv);
1325 v2 = vld1q_f32_aligned_postincrement(vv);
1326 v3 = vld1q_f32_aligned_postincrement(vv);
1328 xy0 = vmul_f32(vget_low_f32(v0), vLo);
1329 xy1 = vmul_f32(vget_low_f32(v1), vLo);
1330 xy2 = vmul_f32(vget_low_f32(v2), vLo);
1331 xy3 = vmul_f32(vget_low_f32(v3), vLo);
1333 z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1334 z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1335 zLo = vmul_f32(z0.val[0], vHi);
1336 zHi = vmul_f32(z1.val[0], vHi);
1338 rLo = vpadd_f32(xy0, xy1);
1339 rHi = vpadd_f32(xy2, xy3);
1340 rLo = vadd_f32(rLo, zLo);
1341 rHi = vadd_f32(rHi, zHi);
1343 maskLo = vclt_f32(rLo, dotMinLo);
1344 maskHi = vclt_f32(rHi, dotMinHi);
1345 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1346 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1347 iLo = vbsl_u32(maskLo, indexLo, iLo);
1348 iHi = vbsl_u32(maskHi, indexHi, iHi);
1349 indexLo = vadd_u32(indexLo, four);
1350 indexHi = vadd_u32(indexHi, four);
1353 for (; i + 4 <= count; i += 4)
1355 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1356 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1357 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1358 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1360 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1361 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1362 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1363 float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1365 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1366 float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1367 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1368 float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1370 float32x2_t rLo = vpadd_f32(xy0, xy1);
1371 float32x2_t rHi = vpadd_f32(xy2, xy3);
1372 rLo = vadd_f32(rLo, zLo);
1373 rHi = vadd_f32(rHi, zHi);
1375 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1376 uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1377 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1378 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1379 iLo = vbsl_u32(maskLo, indexLo, iLo);
1380 iHi = vbsl_u32(maskHi, indexHi, iHi);
1381 indexLo = vadd_u32(indexLo, four);
1382 indexHi = vadd_u32(indexHi, four);
1388 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1389 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1390 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1392 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1393 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1394 float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1396 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1397 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1398 float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1400 float32x2_t rLo = vpadd_f32(xy0, xy1);
1401 float32x2_t rHi = vpadd_f32(xy2, xy2);
1402 rLo = vadd_f32(rLo, zLo);
1403 rHi = vadd_f32(rHi, zHi);
1405 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1406 uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1407 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1408 dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1409 iLo = vbsl_u32(maskLo, indexLo, iLo);
1410 iHi = vbsl_u32(maskHi, indexHi, iHi);
1415 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1416 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1418 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1419 float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1421 float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1422 float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1424 float32x2_t rLo = vpadd_f32(xy0, xy1);
1425 rLo = vadd_f32(rLo, zLo);
1427 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1428 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1429 iLo = vbsl_u32(maskLo, indexLo, iLo);
1434 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1435 float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1436 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1437 float32x2_t zLo = vmul_f32(z0, vHi);
1438 float32x2_t rLo = vpadd_f32(xy0, xy0);
1439 rLo = vadd_f32(rLo, zLo);
1440 uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1441 dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1442 iLo = vbsl_u32(maskLo, indexLo, iLo);
1451 uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
1452 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1453 iLo = vbsl_u32(mask, iHi, iLo);
1456 dotMinHi = vdup_lane_f32(dotMinLo, 1);
1457 iHi = vdup_lane_u32(iLo, 1);
1458 mask = vclt_f32(dotMinHi, dotMinLo);
1459 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1460 iLo = vbsl_u32(mask, iHi, iLo);
1462 *dotResult = vget_lane_f32(dotMinLo, 0);
1463 return vget_lane_u32(iLo, 0);
long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
	const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
	uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
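	// Mirror of _maxdot_large_v1, tracking minima with vcltq_f32 instead of vcgtq_f32.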
1476 unsigned long i = 0;
1477 for (; i + 8 <= count; i += 8)
1479 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1480 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1481 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1482 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1485 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1486 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1488 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1489 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1491 xy0 = vmulq_f32(xy0, vLo);
1492 xy1 = vmulq_f32(xy1, vLo);
1494 float32x4x2_t zb = vuzpq_f32(z0, z1);
1495 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1496 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1497 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1498 x = vaddq_f32(x, z);
1500 uint32x4_t mask = vcltq_f32(x, minDot);
1501 minDot = vbslq_f32(mask, x, minDot);
1502 index = vbslq_u32(mask, local_index, index);
1503 local_index = vaddq_u32(local_index, four);
1505 v0 = vld1q_f32_aligned_postincrement(vv);
1506 v1 = vld1q_f32_aligned_postincrement(vv);
1507 v2 = vld1q_f32_aligned_postincrement(vv);
1508 v3 = vld1q_f32_aligned_postincrement(vv);
1511 xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1512 xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1514 z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1515 z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1517 xy0 = vmulq_f32(xy0, vLo);
1518 xy1 = vmulq_f32(xy1, vLo);
1520 zb = vuzpq_f32(z0, z1);
1521 z = vmulq_f32(zb.val[0], vHi);
1522 xy = vuzpq_f32(xy0, xy1);
1523 x = vaddq_f32(xy.val[0], xy.val[1]);
1524 x = vaddq_f32(x, z);
1526 mask = vcltq_f32(x, minDot);
1527 minDot = vbslq_f32(mask, x, minDot);
1528 index = vbslq_u32(mask, local_index, index);
1529 local_index = vaddq_u32(local_index, four);
1532 for (; i + 4 <= count; i += 4)
1534 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1535 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1536 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1537 float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1540 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1541 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1543 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1544 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1546 xy0 = vmulq_f32(xy0, vLo);
1547 xy1 = vmulq_f32(xy1, vLo);
1549 float32x4x2_t zb = vuzpq_f32(z0, z1);
1550 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1551 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1552 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1553 x = vaddq_f32(x, z);
1555 uint32x4_t mask = vcltq_f32(x, minDot);
1556 minDot = vbslq_f32(mask, x, minDot);
1557 index = vbslq_u32(mask, local_index, index);
1558 local_index = vaddq_u32(local_index, four);
1565 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1566 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1567 float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1570 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1571 float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1573 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1574 float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1576 xy0 = vmulq_f32(xy0, vLo);
1577 xy1 = vmulq_f32(xy1, vLo);
1579 float32x4x2_t zb = vuzpq_f32(z0, z1);
1580 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1581 float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1582 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1583 x = vaddq_f32(x, z);
1585 uint32x4_t mask = vcltq_f32(x, minDot);
1586 minDot = vbslq_f32(mask, x, minDot);
1587 index = vbslq_u32(mask, local_index, index);
1588 local_index = vaddq_u32(local_index, four);
1594 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1595 float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1598 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1600 float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1602 xy0 = vmulq_f32(xy0, vLo);
1604 float32x4x2_t zb = vuzpq_f32(z0, z0);
1605 float32x4_t z = vmulq_f32(zb.val[0], vHi);
1606 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1607 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1608 x = vaddq_f32(x, z);
1610 uint32x4_t mask = vcltq_f32(x, minDot);
1611 minDot = vbslq_f32(mask, x, minDot);
1612 index = vbslq_u32(mask, local_index, index);
1613 local_index = vaddq_u32(local_index, four);
1619 float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1622 float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
1624 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1626 xy0 = vmulq_f32(xy0, vLo);
1628 z = vmulq_f32(z, vHi);
1629 float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1630 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1631 x = vaddq_f32(x, z);
1633 uint32x4_t mask = vcltq_f32(x, minDot);
1634 minDot = vbslq_f32(mask, x, minDot);
1635 index = vbslq_u32(mask, local_index, index);
1636 local_index = vaddq_u32(local_index, four);
1645 uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
1646 float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1647 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1650 float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
1651 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1652 mask = vclt_f32(minDotO, minDot2);
1653 minDot2 = vbsl_f32(mask, minDotO, minDot2);
1654 index2 = vbsl_u32(mask, indexHi, index2);
1656 *dotResult = vget_lane_f32(minDot2, 0);
1657 return vget_lane_u32(index2, 0);
#error Unhandled __APPLE__ arch