#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#include "cpl_port.h"

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
#if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
#else
    return _mm_cvtsi32_si128(*static_cast<const unsigned short*>(ptr));
#endif
}
static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
#else
    return _mm_cvtsi32_si128(*static_cast<const GInt32*>(ptr));
#endif
}

static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#else
    return _mm_cvtsi64_si128(*static_cast<const GInt64*>(ptr));
#endif
}
static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
#else
    *static_cast<GInt16*>(pDest) = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
#endif
}
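/* The matching 32/64-bit store helpers used by Store2Val()/Store4Val() below
 * are not shown in this listing; a minimal sketch, assuming they mirror the
 * CPL_CPU_REQUIRES_ALIGNED_ACCESS pattern of the load/store helpers above: */

static inline void GDALCopyXMMToInt32(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i = _mm_cvtsi128_si32(xmm);
    memcpy(pDest, &i, 4);
#else
    *static_cast<GInt32*>(pDest) = _mm_cvtsi128_si32(xmm);
#endif
}

static inline void GDALCopyXMMToInt64(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i = _mm_cvtsi128_si64(xmm);
    memcpy(&i, &i, 8);
    memcpy(pDest, &i, 8);
#else
    *static_cast<GInt64*>(pDest) = _mm_cvtsi128_si64(xmm);
#endif
}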
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val)) {}
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }
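    /* The comparison helpers above return per-lane masks of all-ones /
     * all-zeros bits, which is exactly what Ternary() expects as its
     * condition. A minimal usage sketch (hypothetical variables v and
     * maxVal), clamping each lane to an upper bound without branching:
     *
     *   XMMReg2Double tooBig = XMMReg2Double::Greater(v, maxVal);
     *   v = XMMReg2Double::Ternary(tooBig, maxVal, v);
     */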
    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float* ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        /* Duplicate the 16-bit values, then arithmetic-shift right to sign-extend. */
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i);
        xmm_i = _mm_srai_epi32(xmm_i, 16);
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128()); /* zero extend */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }
    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3,2,3,2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }
    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm, xmm, _MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }
    inline void Store2Val(double* ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double* ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float* ptr) const
    {
        __m128i xmm_i = _mm_castps_si128( _mm_cvtpd_ps(xmm) );
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store2Val(unsigned char* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2)); /* Move the second int32 next to the first as int16 */
        tmp = _mm_packus_epi16(tmp, tmp); /* Saturate to uint8 */
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16*>(ptr));
    }

    inline void Store2Val(unsigned short* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2)); /* Move the second int32 next to the first as int16 */
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm_storeu_si128( reinterpret_cast<__m128i*>(ptr), _mm_castpd_si128(xmm) );
    }

    inline operator double () const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
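/* Illustrative usage sketch (hypothetical names pafSrc, pabyDst, dfScale):
 * widen two floats to doubles, scale them, and store them back as rounded
 * uint8 values. Note that the XMMReg2Double(double) constructor only fills
 * the low lane, so broadcasting a scalar goes through Load1ValHighAndLow():
 *
 *   XMMReg2Double v = XMMReg2Double::Load2Val(pafSrc);
 *   v *= XMMReg2Double::Load1ValHighAndLow(&dfScale);
 *   v.Store2Val(pabyDst);
 */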
#else

#warning "Software emulation of SSE2 !"

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() = default;
    XMMReg2Double(double val) { low = val; high = 0.0; }
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }
    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        if( cond.low != 0 )
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if( cond.high != 0 )
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }
    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }
    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }
    inline void Store2Val(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float* ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char* ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short* ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }
    inline operator double () const
    {
        return low;
    }
};

#endif /* (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION) */

#ifdef __AVX__

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() = default;
    XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}
    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }
    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double* ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double* ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float* ptr)
    {
        ymm = _mm256_cvtps_pd( _mm_loadu_ps(ptr) );
    }
    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }
    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }
    void AddToLow( const XMMReg2Double& other )
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd( ymm2, other.xmm, 0 );
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }
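    /* GetHorizSum() trace: with ymm = [d0 d1 d2 d3], _mm256_hadd_pd yields
     * [d0+d1, d0+d1, d2+d3, d2+d3]; _mm256_permute2f128_pd(..., 1) swaps the
     * two 128-bit halves, so the final _mm256_add_pd leaves d0+d1+d2+d3 in
     * every lane, and the low lane is extracted. */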
    inline void Store4Val(unsigned char* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        /* Gather the lowest byte of each of the 4 int32 values. */
        xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32*>(ptr));
    }

    inline void Store4Val(unsigned short* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i); /* Pack uint32 to uint16 */
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store4Val(float* ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double* ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm256_storeu_si256( reinterpret_cast<__m256i*>(ptr), _mm256_castpd_si256(ymm) );
    }
};
#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }
    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }
    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }
    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }
    void AddToLow( const XMMReg2Double& other )
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

    inline void Store4Val(unsigned char* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }
    inline void Store4Val(unsigned short* ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0); /* Pack uint32 to uint16 */
#else
        xmm0 = _mm_add_epi32( xmm0, _mm_set1_epi32(-32768) );
        xmm0 = _mm_packs_epi32( xmm0, xmm0 );
        xmm0 = _mm_sub_epi16( xmm0, _mm_set1_epi16(-32768) );
#endif
        GDALCopyXMMToInt64(xmm0, reinterpret_cast<GInt64*>(ptr));
#endif
    }
    inline void Store4Val(float* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void Store4Val(double* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr+16);
    }
};

#endif /* __AVX__ */
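/* End-to-end usage sketch (hypothetical names padfX, padfY, nCount):
 * accumulating a dot product four doubles at a time, with a scalar tail
 * loop. The same code compiles against either XMMReg4Double implementation
 * above (AVX, or the SSE2/emulated XMMReg2Double pair):
 *
 *   XMMReg4Double oSum = XMMReg4Double::Zero();
 *   int i = 0;
 *   for( ; i + 3 < nCount; i += 4 )
 *   {
 *       XMMReg4Double oX = XMMReg4Double::Load4Val(padfX + i);
 *       XMMReg4Double oY = XMMReg4Double::Load4Val(padfY + i);
 *       oSum += oX * oY;
 *   }
 *   double dfSum = oSum.GetHorizSum();
 *   for( ; i < nCount; ++i )
 *       dfSum += padfX[i] * padfY[i];
 */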
#endif /* GDALSSE_PRIV_H_INCLUDED */