GDAL
gdalsse_priv.h
1 /******************************************************************************
2  * $Id: gdalsse_priv.h e37e476c4cf8f4b0df8995e0d95d5d672fca1a9b 2018-05-05 16:54:18 +0200 Even Rouault $
3  *
4  * Project: GDAL
5  * Purpose: SSE2 helper
6  * Author: Even Rouault <even dot rouault at spatialys dot com>
7  *
8  ******************************************************************************
9  * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a
12  * copy of this software and associated documentation files (the "Software"),
13  * to deal in the Software without restriction, including without limitation
14  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15  * and/or sell copies of the Software, and to permit persons to whom the
16  * Software is furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included
19  * in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  ****************************************************************************/
29 
30 #ifndef GDALSSE_PRIV_H_INCLUDED
31 #define GDALSSE_PRIV_H_INCLUDED
32 
33 #ifndef DOXYGEN_SKIP
34 
35 #include "cpl_port.h"
36 
37 /* We restrict to 64-bit processors because they are guaranteed to have SSE2 */
38 /* This could also be used on 32-bit processors, but it would require a runtime check */
39 #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)
40 
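As the comment above notes, a 32-bit build would need a runtime SSE2 check before taking this path. A minimal editorial sketch of such a check (not part of GDAL; the helper name is hypothetical, and it assumes GCC/Clang's __builtin_cpu_supports(), while MSVC would need __cpuid instead):

static bool GDALHasRuntimeSSE2()   /* hypothetical helper, shown for illustration only */
{
#if defined(__GNUC__) || defined(__clang__)
    return __builtin_cpu_supports("sse2") != 0;
#else
    return false;  /* conservative fallback when no detection mechanism is available */
#endif
}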
41 /* Requires SSE2 */
42 #include <emmintrin.h>
43 #include <string.h>
44 
45 #ifdef __SSE4_1__
46 #include <smmintrin.h>
47 #endif
48 
49 #include "gdal_priv_templates.hpp"
50 
51 static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
52 {
53 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
54  unsigned short s;
55  memcpy(&s, ptr, 2);
56  return _mm_cvtsi32_si128(s);
57 #else
58  return _mm_cvtsi32_si128(*static_cast<const unsigned short*>(ptr));
59 #endif
60 }
61 
62 static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
63 {
64 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
65  GInt32 i;
66  memcpy(&i, ptr, 4);
67  return _mm_cvtsi32_si128(i);
68 #else
69  return _mm_cvtsi32_si128(*static_cast<const GInt32*>(ptr));
70 #endif
71 }
72 
73 static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
74 {
75 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
76  GInt64 i;
77  memcpy(&i, ptr, 8);
78  return _mm_cvtsi64_si128(i);
79 #else
80  return _mm_cvtsi64_si128(*static_cast<const GInt64*>(ptr));
81 #endif
82 }
83 
84 static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
85 {
86 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
87  GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
88  memcpy(pDest, &i, 2);
89 #else
90  *static_cast<GInt16*>(pDest) = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
91 #endif
92 }
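The Store2Val()/Store4Val() methods further down also call GDALCopyXMMToInt32() and GDALCopyXMMToInt64(), which are not defined in this listing (in this GDAL version they are presumably provided by gdal_priv_templates.hpp, included above). Illustrative sketches of analogous implementations, mirroring the CPL_CPU_REQUIRES_ALIGNED_ACCESS convention of GDALCopyXMMToInt16() but not necessarily GDAL's exact code:

static inline void GDALCopyXMMToInt32(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i = _mm_cvtsi128_si32(xmm);   /* extract the low 32 bits */
    memcpy(pDest, &i, 4);
#else
    *static_cast<GInt32*>(pDest) = _mm_cvtsi128_si32(xmm);
#endif
}

static inline void GDALCopyXMMToInt64(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i = _mm_cvtsi128_si64(xmm);   /* extract the low 64 bits (x86-64 only) */
    memcpy(pDest, &i, 8);
#else
    *static_cast<GInt64*>(pDest) = _mm_cvtsi128_si64(xmm);
#endif
}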
93 
94 class XMMReg2Double
95 {
96  public:
97  __m128d xmm;
98 
99 #if defined(__GNUC__)
100 #pragma GCC diagnostic push
101 #pragma GCC diagnostic ignored "-Weffc++"
102 #endif
103  /* coverity[uninit_member] */
104  XMMReg2Double() = default;
105 #if defined(__GNUC__)
106 #pragma GCC diagnostic pop
107 #endif
108 
109  XMMReg2Double(double val): xmm(_mm_load_sd (&val)) {}
110  XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}
111 
112  static inline XMMReg2Double Zero()
113  {
114  XMMReg2Double reg;
115  reg.Zeroize();
116  return reg;
117  }
118 
119  static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
120  {
121  XMMReg2Double reg;
122  reg.nsLoad1ValHighAndLow(ptr);
123  return reg;
124  }
125 
126  static inline XMMReg2Double Load2Val(const double* ptr)
127  {
128  XMMReg2Double reg;
129  reg.nsLoad2Val(ptr);
130  return reg;
131  }
132 
133  static inline XMMReg2Double Load2Val(const float* ptr)
134  {
135  XMMReg2Double reg;
136  reg.nsLoad2Val(ptr);
137  return reg;
138  }
139 
140  static inline XMMReg2Double Load2ValAligned(const double* ptr)
141  {
142  XMMReg2Double reg;
143  reg.nsLoad2ValAligned(ptr);
144  return reg;
145  }
146 
147  static inline XMMReg2Double Load2Val(const unsigned char* ptr)
148  {
149  XMMReg2Double reg;
150  reg.nsLoad2Val(ptr);
151  return reg;
152  }
153 
154  static inline XMMReg2Double Load2Val(const short* ptr)
155  {
156  XMMReg2Double reg;
157  reg.nsLoad2Val(ptr);
158  return reg;
159  }
160 
161  static inline XMMReg2Double Load2Val(const unsigned short* ptr)
162  {
163  XMMReg2Double reg;
164  reg.nsLoad2Val(ptr);
165  return reg;
166  }
167 
168  static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
169  {
170  XMMReg2Double reg;
171  reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
172  return reg;
173  }
174 
175  static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
176  {
177  XMMReg2Double reg;
178  reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
179  return reg;
180  }
181 
182  static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
183  {
184  XMMReg2Double reg;
185  reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
186  return reg;
187  }
188 
189  static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
190  {
191  XMMReg2Double reg;
192  reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
193  return reg;
194  }
195 
196  static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
197  {
198  XMMReg2Double reg;
199  reg.xmm = _mm_or_pd(_mm_and_pd (cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
200  return reg;
201  }
202 
203  static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
204  {
205  XMMReg2Double reg;
206  reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
207  return reg;
208  }
209 
210  inline void nsLoad1ValHighAndLow(const double* ptr)
211  {
212  xmm = _mm_load1_pd(ptr);
213  }
214 
215  inline void nsLoad2Val(const double* ptr)
216  {
217  xmm = _mm_loadu_pd(ptr);
218  }
219 
220  inline void nsLoad2ValAligned(const double* ptr)
221  {
222  xmm = _mm_load_pd(ptr);
223  }
224 
225  inline void nsLoad2Val(const float* ptr)
226  {
227  xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
228  }
229 
230  inline void nsLoad2Val(const unsigned char* ptr)
231  {
232  __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
233 #ifdef __SSE4_1__
234  xmm_i = _mm_cvtepu8_epi32(xmm_i);
235 #else
236  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
237  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
238 #endif
239  xmm = _mm_cvtepi32_pd(xmm_i);
240  }
241 
242  inline void nsLoad2Val(const short* ptr)
243  {
244  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
245 #ifdef __SSE4_1__
246  xmm_i = _mm_cvtepi16_epi32(xmm_i);
247 #else
248  xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
249  xmm_i = _mm_srai_epi32(xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
250 #endif
251  xmm = _mm_cvtepi32_pd(xmm_i);
252  }
253 
254  inline void nsLoad2Val(const unsigned short* ptr)
255  {
256  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
257 #ifdef __SSE4_1__
258  xmm_i = _mm_cvtepu16_epi32(xmm_i);
259 #else
260  xmm_i = _mm_unpacklo_epi16(xmm_i,_mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
261 #endif
262  xmm = _mm_cvtepi32_pd(xmm_i);
263  }
264 
265  static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
266  {
267  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
268 #ifdef __SSE4_1__
269  xmm_i = _mm_cvtepu8_epi32(xmm_i);
270 #else
271  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
272  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
273 #endif
274  low.xmm = _mm_cvtepi32_pd(xmm_i);
275  high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
276  }
277 
278  static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
279  {
280  low.nsLoad2Val(ptr);
281  high.nsLoad2Val(ptr+2);
282  }
283 
284  static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
285  {
286  low.nsLoad2Val(ptr);
287  high.nsLoad2Val(ptr+2);
288  }
289 
290  static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
291  {
292  low.nsLoad2Val(ptr);
293  high.nsLoad2Val(ptr+2);
294  }
295 
296  static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
297  {
298  __m128 temp1 = _mm_loadu_ps(ptr);
299  __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
300  low.xmm = _mm_cvtps_pd(temp1);
301  high.xmm = _mm_cvtps_pd(temp2);
302  }
303 
304  inline void Zeroize()
305  {
306  xmm = _mm_setzero_pd();
307  }
308 
309  inline XMMReg2Double& operator= (const XMMReg2Double& other)
310  {
311  xmm = other.xmm;
312  return *this;
313  }
314 
315  inline XMMReg2Double& operator+= (const XMMReg2Double& other)
316  {
317  xmm = _mm_add_pd(xmm, other.xmm);
318  return *this;
319  }
320 
321  inline XMMReg2Double& operator*= (const XMMReg2Double& other)
322  {
323  xmm = _mm_mul_pd(xmm, other.xmm);
324  return *this;
325  }
326 
327  inline XMMReg2Double operator+ (const XMMReg2Double& other) const
328  {
329  XMMReg2Double ret;
330  ret.xmm = _mm_add_pd(xmm, other.xmm);
331  return ret;
332  }
333 
334  inline XMMReg2Double operator- (const XMMReg2Double& other) const
335  {
336  XMMReg2Double ret;
337  ret.xmm = _mm_sub_pd(xmm, other.xmm);
338  return ret;
339  }
340 
341  inline XMMReg2Double operator* (const XMMReg2Double& other) const
342  {
343  XMMReg2Double ret;
344  ret.xmm = _mm_mul_pd(xmm, other.xmm);
345  return ret;
346  }
347 
348  inline XMMReg2Double operator/ (const XMMReg2Double& other) const
349  {
350  XMMReg2Double ret;
351  ret.xmm = _mm_div_pd(xmm, other.xmm);
352  return ret;
353  }
354 
355  inline double GetHorizSum() const
356  {
357  __m128d xmm2;
358  xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
359  return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
360  }
361 
362  inline void Store2Val(double* ptr) const
363  {
364  _mm_storeu_pd(ptr, xmm);
365  }
366 
367  inline void Store2ValAligned(double* ptr) const
368  {
369  _mm_store_pd(ptr, xmm);
370  }
371 
372  inline void Store2Val(float* ptr) const
373  {
374  __m128i xmm_i = _mm_castps_si128( _mm_cvtpd_ps(xmm) );
375  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
376  }
377 
378  inline void Store2Val(unsigned char* ptr) const
379  {
380  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
381  // X X X X 0 B 0 A --> 0 X X X X 0 B 0 (srli)
382  // or X X X X 0 B 0 A
383  tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));
384  tmp = _mm_packus_epi16(tmp, tmp);
385  GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16*>(ptr));
386  }
387 
388  inline void Store2Val(unsigned short* ptr) const
389  {
390  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
391  // X X X X 0 B 0 A --> 0 X X X X 0 B 0 (srli)
392  // or X X X X 0 B 0 A
393  tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));
394  GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
395 
396  //ptr[0] = (GUInt16)_mm_extract_epi16(tmp, 0);
397  //ptr[1] = (GUInt16)_mm_extract_epi16(tmp, 2);
398  }
399 
400  inline void StoreMask(unsigned char* ptr) const
401  {
402  _mm_storeu_si128( reinterpret_cast<__m128i*>(ptr), _mm_castpd_si128(xmm) );
403  }
404 
405  inline operator double () const
406  {
407  return _mm_cvtsd_f64(xmm);
408  }
409 };
410 
411 #else
412 
413 #warning "Software emulation of SSE2!"
414 
415 class XMMReg2Double
416 {
417  public:
418  double low;
419  double high;
420 
421  XMMReg2Double() {}
422  XMMReg2Double(double val) { low = val; high = 0.0; }
423  XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}
424 
425  static inline XMMReg2Double Zero()
426  {
427  XMMReg2Double reg;
428  reg.Zeroize();
429  return reg;
430  }
431 
432  static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
433  {
434  XMMReg2Double reg;
435  reg.nsLoad1ValHighAndLow(ptr);
436  return reg;
437  }
438 
439  static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
440  {
441  XMMReg2Double reg;
442 
443  if (expr1.low == expr2.low)
444  memset(&(reg.low), 0xFF, sizeof(double));
445  else
446  reg.low = 0;
447 
448  if (expr1.high == expr2.high)
449  memset(&(reg.high), 0xFF, sizeof(double));
450  else
451  reg.high = 0;
452 
453  return reg;
454  }
455 
456  static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
457  {
458  XMMReg2Double reg;
459 
460  if (expr1.low != expr2.low)
461  memset(&(reg.low), 0xFF, sizeof(double));
462  else
463  reg.low = 0;
464 
465  if (expr1.high != expr2.high)
466  memset(&(reg.high), 0xFF, sizeof(double));
467  else
468  reg.high = 0;
469 
470  return reg;
471  }
472 
473  static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
474  {
475  XMMReg2Double reg;
476 
477  if (expr1.low > expr2.low)
478  memset(&(reg.low), 0xFF, sizeof(double));
479  else
480  reg.low = 0;
481 
482  if (expr1.high > expr2.high)
483  memset(&(reg.high), 0xFF, sizeof(double));
484  else
485  reg.high = 0;
486 
487  return reg;
488  }
489 
490  static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
491  {
492  XMMReg2Double reg;
493  int low1[2], high1[2];
494  int low2[2], high2[2];
495  memcpy(low1, &expr1.low, sizeof(double));
496  memcpy(high1, &expr1.high, sizeof(double));
497  memcpy(low2, &expr2.low, sizeof(double));
498  memcpy(high2, &expr2.high, sizeof(double));
499  low1[0] &= low2[0];
500  low1[1] &= low2[1];
501  high1[0] &= high2[0];
502  high1[1] &= high2[1];
503  memcpy(&reg.low, low1, sizeof(double));
504  memcpy(&reg.high, high1, sizeof(double));
505  return reg;
506  }
507 
508  static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
509  {
510  XMMReg2Double reg;
511  if( cond.low )
512  reg.low = true_expr.low;
513  else
514  reg.low = false_expr.low;
515  if( cond.high )
516  reg.high = true_expr.high;
517  else
518  reg.high = false_expr.high;
519  return reg;
520  }
521 
522  static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
523  {
524  XMMReg2Double reg;
525  reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
526  reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
527  return reg;
528  }
529 
530  static inline XMMReg2Double Load2Val(const double* ptr)
531  {
532  XMMReg2Double reg;
533  reg.nsLoad2Val(ptr);
534  return reg;
535  }
536 
537  static inline XMMReg2Double Load2ValAligned(const double* ptr)
538  {
539  XMMReg2Double reg;
540  reg.nsLoad2ValAligned(ptr);
541  return reg;
542  }
543 
544  static inline XMMReg2Double Load2Val(const float* ptr)
545  {
546  XMMReg2Double reg;
547  reg.nsLoad2Val(ptr);
548  return reg;
549  }
550 
551  static inline XMMReg2Double Load2Val(const unsigned char* ptr)
552  {
553  XMMReg2Double reg;
554  reg.nsLoad2Val(ptr);
555  return reg;
556  }
557 
558  static inline XMMReg2Double Load2Val(const short* ptr)
559  {
560  XMMReg2Double reg;
561  reg.nsLoad2Val(ptr);
562  return reg;
563  }
564 
565  static inline XMMReg2Double Load2Val(const unsigned short* ptr)
566  {
567  XMMReg2Double reg;
568  reg.nsLoad2Val(ptr);
569  return reg;
570  }
571 
572  inline void nsLoad1ValHighAndLow(const double* ptr)
573  {
574  low = ptr[0];
575  high = ptr[0];
576  }
577 
578  inline void nsLoad2Val(const double* ptr)
579  {
580  low = ptr[0];
581  high = ptr[1];
582  }
583 
584  inline void nsLoad2ValAligned(const double* ptr)
585  {
586  low = ptr[0];
587  high = ptr[1];
588  }
589 
590  inline void nsLoad2Val(const float* ptr)
591  {
592  low = ptr[0];
593  high = ptr[1];
594  }
595 
596  inline void nsLoad2Val(const unsigned char* ptr)
597  {
598  low = ptr[0];
599  high = ptr[1];
600  }
601 
602  inline void nsLoad2Val(const short* ptr)
603  {
604  low = ptr[0];
605  high = ptr[1];
606  }
607 
608  inline void nsLoad2Val(const unsigned short* ptr)
609  {
610  low = ptr[0];
611  high = ptr[1];
612  }
613 
614  static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
615  {
616  low.low = ptr[0];
617  low.high = ptr[1];
618  high.low = ptr[2];
619  high.high = ptr[3];
620  }
621 
622  static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
623  {
624  low.nsLoad2Val(ptr);
625  high.nsLoad2Val(ptr+2);
626  }
627 
628  static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
629  {
630  low.nsLoad2Val(ptr);
631  high.nsLoad2Val(ptr+2);
632  }
633 
634  static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
635  {
636  low.nsLoad2Val(ptr);
637  high.nsLoad2Val(ptr+2);
638  }
639 
640  static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
641  {
642  low.nsLoad2Val(ptr);
643  high.nsLoad2Val(ptr+2);
644  }
645 
646  inline void Zeroize()
647  {
648  low = 0.0;
649  high = 0.0;
650  }
651 
652  inline XMMReg2Double& operator= (const XMMReg2Double& other)
653  {
654  low = other.low;
655  high = other.high;
656  return *this;
657  }
658 
659  inline XMMReg2Double& operator+= (const XMMReg2Double& other)
660  {
661  low += other.low;
662  high += other.high;
663  return *this;
664  }
665 
666  inline XMMReg2Double& operator*= (const XMMReg2Double& other)
667  {
668  low *= other.low;
669  high *= other.high;
670  return *this;
671  }
672 
673  inline XMMReg2Double operator+ (const XMMReg2Double& other) const
674  {
675  XMMReg2Double ret;
676  ret.low = low + other.low;
677  ret.high = high + other.high;
678  return ret;
679  }
680 
681  inline XMMReg2Double operator- (const XMMReg2Double& other) const
682  {
683  XMMReg2Double ret;
684  ret.low = low - other.low;
685  ret.high = high - other.high;
686  return ret;
687  }
688 
689  inline XMMReg2Double operator* (const XMMReg2Double& other) const
690  {
691  XMMReg2Double ret;
692  ret.low = low * other.low;
693  ret.high = high * other.high;
694  return ret;
695  }
696 
697  inline XMMReg2Double operator/ (const XMMReg2Double& other) const
698  {
699  XMMReg2Double ret;
700  ret.low = low / other.low;
701  ret.high = high / other.high;
702  return ret;
703  }
704 
705  inline double GetHorizSum() const
706  {
707  return low + high;
708  }
709 
710  inline void Store2Val(double* ptr) const
711  {
712  ptr[0] = low;
713  ptr[1] = high;
714  }
715 
716  inline void Store2ValAligned(double* ptr) const
717  {
718  ptr[0] = low;
719  ptr[1] = high;
720  }
721 
722  inline void Store2Val(float* ptr) const
723  {
724  ptr[0] = low;
725  ptr[1] = high;
726  }
727 
728  void Store2Val(unsigned char* ptr) const
729  {
730  ptr[0] = (unsigned char)(low + 0.5);
731  ptr[1] = (unsigned char)(high + 0.5);
732  }
733 
734  void Store2Val(unsigned short* ptr) const
735  {
736  ptr[0] = (GUInt16)(low + 0.5);
737  ptr[1] = (GUInt16)(high + 0.5);
738  }
739 
740  inline void StoreMask(unsigned char* ptr) const
741  {
742  memcpy(ptr, &low, 8);
743  memcpy(ptr + 8, &high, 8);
744  }
745 
746  inline operator double () const
747  {
748  return low;
749  }
750 };
751 
752 #endif /* (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION) */
753 
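Before the 4-element wrapper below, a minimal usage sketch of XMMReg2Double (editorial; function and variable names are hypothetical): load two values per iteration, accumulate with the overloaded operators, then reduce with GetHorizSum(). The same code compiles against either the SSE2 implementation or the software emulation above.

static double SketchDotProduct(const double* padfX, const double* padfY, int nCount)
{
    XMMReg2Double oAcc = XMMReg2Double::Zero();
    /* assumes nCount is even; a scalar loop would handle any remainder */
    for( int i = 0; i < nCount; i += 2 )
    {
        XMMReg2Double oX = XMMReg2Double::Load2Val(padfX + i);
        XMMReg2Double oY = XMMReg2Double::Load2Val(padfY + i);
        oAcc += oX * oY;
    }
    return oAcc.GetHorizSum();
}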
754 #ifdef __AVX__
755 
756 #include <immintrin.h>
757 
758 class XMMReg4Double
759 {
760  public:
761  __m256d ymm;
762 
763  XMMReg4Double() {}
764  XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}
765 
766  static inline XMMReg4Double Zero()
767  {
768  XMMReg4Double reg;
769  reg.Zeroize();
770  return reg;
771  }
772 
773  inline void Zeroize()
774  {
775  ymm = _mm256_setzero_pd();
776  }
777 
778  static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
779  {
780  XMMReg4Double reg;
781  reg.nsLoad1ValHighAndLow(ptr);
782  return reg;
783  }
784 
785  inline void nsLoad1ValHighAndLow(const double* ptr)
786  {
787  ymm = _mm256_set1_pd(*ptr);
788  }
789 
790  static inline XMMReg4Double Load4Val(const unsigned char* ptr)
791  {
792  XMMReg4Double reg;
793  reg.nsLoad4Val(ptr);
794  return reg;
795  }
796 
797  inline void nsLoad4Val(const unsigned char* ptr)
798  {
799  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
800  xmm_i = _mm_cvtepu8_epi32(xmm_i);
801  ymm = _mm256_cvtepi32_pd(xmm_i);
802  }
803 
804  static inline XMMReg4Double Load4Val(const short* ptr)
805  {
806  XMMReg4Double reg;
807  reg.nsLoad4Val(ptr);
808  return reg;
809  }
810 
811  inline void nsLoad4Val(const short* ptr)
812  {
813  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
814  xmm_i = _mm_cvtepi16_epi32(xmm_i);
815  ymm = _mm256_cvtepi32_pd(xmm_i);
816  }
817 
818  static inline XMMReg4Double Load4Val(const unsigned short* ptr)
819  {
820  XMMReg4Double reg;
821  reg.nsLoad4Val(ptr);
822  return reg;
823  }
824 
825  inline void nsLoad4Val(const unsigned short* ptr)
826  {
827  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
828  xmm_i = _mm_cvtepu16_epi32(xmm_i);
829  ymm = _mm256_cvtepi32_pd(xmm_i); // ok to use signed conversion since we are in the ushort range, so cannot be interpreted as negative int32
830  }
831 
832  static inline XMMReg4Double Load4Val(const double* ptr)
833  {
834  XMMReg4Double reg;
835  reg.nsLoad4Val(ptr);
836  return reg;
837  }
838 
839  inline void nsLoad4Val(const double* ptr)
840  {
841  ymm = _mm256_loadu_pd(ptr);
842  }
843 
844  static inline XMMReg4Double Load4ValAligned(const double* ptr)
845  {
846  XMMReg4Double reg;
847  reg.nsLoad4ValAligned(ptr);
848  return reg;
849  }
850 
851  inline void nsLoad4ValAligned(const double* ptr)
852  {
853  ymm = _mm256_load_pd(ptr);
854  }
855 
856  static inline XMMReg4Double Load4Val(const float* ptr)
857  {
858  XMMReg4Double reg;
859  reg.nsLoad4Val(ptr);
860  return reg;
861  }
862 
863  inline void nsLoad4Val(const float* ptr)
864  {
865  ymm = _mm256_cvtps_pd( _mm_loadu_ps(ptr) );
866  }
867 
868  static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
869  {
870  XMMReg4Double reg;
871  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
872  return reg;
873  }
874 
875  static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
876  {
877  XMMReg4Double reg;
878  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
879  return reg;
880  }
881 
882  static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
883  {
884  XMMReg4Double reg;
885  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
886  return reg;
887  }
888 
889  static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
890  {
891  XMMReg4Double reg;
892  reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
893  return reg;
894  }
895 
896  static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
897  {
898  XMMReg4Double reg;
899  reg.ymm = _mm256_or_pd(_mm256_and_pd (cond.ymm, true_expr.ymm), _mm256_andnot_pd(cond.ymm, false_expr.ymm));
900  return reg;
901  }
902 
903  static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
904  {
905  XMMReg4Double reg;
906  reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
907  return reg;
908  }
909 
910  inline XMMReg4Double& operator= (const XMMReg4Double& other)
911  {
912  ymm = other.ymm;
913  return *this;
914  }
915 
916  inline XMMReg4Double& operator+= (const XMMReg4Double& other)
917  {
918  ymm = _mm256_add_pd(ymm, other.ymm);
919  return *this;
920  }
921 
922  inline XMMReg4Double& operator*= (const XMMReg4Double& other)
923  {
924  ymm = _mm256_mul_pd(ymm, other.ymm);
925  return *this;
926  }
927 
928  inline XMMReg4Double operator+ (const XMMReg4Double& other) const
929  {
930  XMMReg4Double ret;
931  ret.ymm = _mm256_add_pd(ymm, other.ymm);
932  return ret;
933  }
934 
935  inline XMMReg4Double operator- (const XMMReg4Double& other) const
936  {
937  XMMReg4Double ret;
938  ret.ymm = _mm256_sub_pd(ymm, other.ymm);
939  return ret;
940  }
941 
942  inline XMMReg4Double operator* (const XMMReg4Double& other) const
943  {
944  XMMReg4Double ret;
945  ret.ymm = _mm256_mul_pd(ymm, other.ymm);
946  return ret;
947  }
948 
949  inline XMMReg4Double operator/ (const XMMReg4Double& other) const
950  {
951  XMMReg4Double ret;
952  ret.ymm = _mm256_div_pd(ymm, other.ymm);
953  return ret;
954  }
955 
956  void AddToLow( const XMMReg2Double& other )
957  {
958  __m256d ymm2 = _mm256_setzero_pd();
959  ymm2 = _mm256_insertf128_pd( ymm2, other.xmm, 0);
960  ymm = _mm256_add_pd(ymm, ymm2);
961  }
962 
963  inline double GetHorizSum() const
964  {
965  __m256d ymm_tmp1, ymm_tmp2;
966  ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
967  ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
968  ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
969  return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
970  }
971 
972  inline void Store4Val(unsigned char* ptr) const
973  {
974  __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
975  //xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
976  //xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
977  xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24))); // SSSE3
978  GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32*>(ptr));
979  }
980 
981  inline void Store4Val(unsigned short* ptr) const
982  {
983  __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
984  xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack uint32 to uint16
985  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
986  }
987 
988  inline void Store4Val(float* ptr) const
989  {
990  _mm_storeu_ps(ptr, _mm256_cvtpd_ps (ymm));
991  }
992 
993  inline void Store4Val(double* ptr) const
994  {
995  _mm256_storeu_pd(ptr, ymm);
996  }
997 
998  inline void StoreMask(unsigned char* ptr) const
999  {
1000  _mm256_storeu_si256( reinterpret_cast<__m256i*>(ptr), _mm256_castpd_si256(ymm) );
1001  }
1002 };
1003 
1004 #else
1005 
1006 class XMMReg4Double
1007 {
1008  public:
1009  XMMReg2Double low, high;
1010 
1011 #if defined(__GNUC__)
1012 #pragma GCC diagnostic push
1013 #pragma GCC diagnostic ignored "-Weffc++"
1014 #endif
1015  /* coverity[uninit_member] */
1016  XMMReg4Double() = default;
1017 #if defined(__GNUC__)
1018 #pragma GCC diagnostic pop
1019 #endif
1020 
1021  XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}
1022 
1023  static inline XMMReg4Double Zero()
1024  {
1025  XMMReg4Double reg;
1026  reg.low.Zeroize();
1027  reg.high.Zeroize();
1028  return reg;
1029  }
1030 
1031  static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
1032  {
1033  XMMReg4Double reg;
1034  reg.low.nsLoad1ValHighAndLow(ptr);
1035  reg.high = reg.low;
1036  return reg;
1037  }
1038 
1039  static inline XMMReg4Double Load4Val(const unsigned char* ptr)
1040  {
1041  XMMReg4Double reg;
1042  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1043  return reg;
1044  }
1045 
1046  static inline XMMReg4Double Load4Val(const short* ptr)
1047  {
1048  XMMReg4Double reg;
1049  reg.low.nsLoad2Val(ptr);
1050  reg.high.nsLoad2Val(ptr+2);
1051  return reg;
1052  }
1053 
1054  static inline XMMReg4Double Load4Val(const unsigned short* ptr)
1055  {
1056  XMMReg4Double reg;
1057  reg.low.nsLoad2Val(ptr);
1058  reg.high.nsLoad2Val(ptr+2);
1059  return reg;
1060  }
1061 
1062  static inline XMMReg4Double Load4Val(const double* ptr)
1063  {
1064  XMMReg4Double reg;
1065  reg.low.nsLoad2Val(ptr);
1066  reg.high.nsLoad2Val(ptr+2);
1067  return reg;
1068  }
1069 
1070  static inline XMMReg4Double Load4ValAligned(const double* ptr)
1071  {
1072  XMMReg4Double reg;
1073  reg.low.nsLoad2ValAligned(ptr);
1074  reg.high.nsLoad2ValAligned(ptr+2);
1075  return reg;
1076  }
1077 
1078  static inline XMMReg4Double Load4Val(const float* ptr)
1079  {
1080  XMMReg4Double reg;
1081  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1082  return reg;
1083  }
1084 
1085  static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1086  {
1087  XMMReg4Double reg;
1088  reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
1089  reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
1090  return reg;
1091  }
1092 
1093  static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1094  {
1095  XMMReg4Double reg;
1096  reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
1097  reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
1098  return reg;
1099  }
1100 
1101  static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1102  {
1103  XMMReg4Double reg;
1104  reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
1105  reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
1106  return reg;
1107  }
1108 
1109  static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1110  {
1111  XMMReg4Double reg;
1112  reg.low = XMMReg2Double::And(expr1.low, expr2.low);
1113  reg.high = XMMReg2Double::And(expr1.high, expr2.high);
1114  return reg;
1115  }
1116 
1117  static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
1118  {
1119  XMMReg4Double reg;
1120  reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
1121  reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
1122  return reg;
1123  }
1124 
1125  static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1126  {
1127  XMMReg4Double reg;
1128  reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
1129  reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
1130  return reg;
1131  }
1132 
1133  inline XMMReg4Double& operator= (const XMMReg4Double& other)
1134  {
1135  low = other.low;
1136  high = other.high;
1137  return *this;
1138  }
1139 
1140  inline XMMReg4Double& operator+= (const XMMReg4Double& other)
1141  {
1142  low += other.low;
1143  high += other.high;
1144  return *this;
1145  }
1146 
1147  inline XMMReg4Double& operator*= (const XMMReg4Double& other)
1148  {
1149  low *= other.low;
1150  high *= other.high;
1151  return *this;
1152  }
1153 
1154  inline XMMReg4Double operator+ (const XMMReg4Double& other) const
1155  {
1156  XMMReg4Double ret;
1157  ret.low = low + other.low;
1158  ret.high = high + other.high;
1159  return ret;
1160  }
1161 
1162  inline XMMReg4Double operator- (const XMMReg4Double& other) const
1163  {
1164  XMMReg4Double ret;
1165  ret.low = low - other.low;
1166  ret.high = high - other.high;
1167  return ret;
1168  }
1169 
1170  inline XMMReg4Double operator* (const XMMReg4Double& other) const
1171  {
1172  XMMReg4Double ret;
1173  ret.low = low * other.low;
1174  ret.high = high * other.high;
1175  return ret;
1176  }
1177 
1178  inline XMMReg4Double operator/ (const XMMReg4Double& other) const
1179  {
1180  XMMReg4Double ret;
1181  ret.low = low / other.low;
1182  ret.high = high / other.high;
1183  return ret;
1184  }
1185 
1186  void AddToLow( const XMMReg2Double& other )
1187  {
1188  low += other;
1189  }
1190 
1191  inline double GetHorizSum() const
1192  {
1193  return (low + high).GetHorizSum();
1194  }
1195 
1196  inline void Store4Val(unsigned char* ptr) const
1197  {
1198  low.Store2Val(ptr);
1199  high.Store2Val(ptr+2);
1200  }
1201 
1202  inline void Store4Val(unsigned short* ptr) const
1203  {
1204 #if 1
1205  low.Store2Val(ptr);
1206  high.Store2Val(ptr+2);
1207 #else
1208  __m128i xmm0 = _mm_cvtpd_epi32 (low.xmm);
1209  __m128i xmm1 = _mm_cvtpd_epi32 (high.xmm);
1210  xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
1211 #if __SSE4_1__
1212  xmm0 = _mm_packus_epi32(xmm0, xmm0); // Pack uint32 to uint16
1213 #else
1214  xmm0 = _mm_add_epi32( xmm0, _mm_set1_epi32(-32768) );
1215  xmm0 = _mm_packs_epi32( xmm0, xmm0 );
1216  xmm0 = _mm_sub_epi16( xmm0, _mm_set1_epi16(-32768) );
1217 #endif
1218  GDALCopyXMMToInt64(xmm0, (GInt64*)ptr);
1219 #endif
1220  }
1221 
1222  inline void Store4Val(float* ptr) const
1223  {
1224  low.Store2Val(ptr);
1225  high.Store2Val(ptr+2);
1226  }
1227 
1228  inline void Store4Val(double* ptr) const
1229  {
1230  low.Store2Val(ptr);
1231  high.Store2Val(ptr+2);
1232  }
1233 
1234  inline void StoreMask(unsigned char* ptr) const
1235  {
1236  low.StoreMask(ptr);
1237  high.StoreMask(ptr+16);
1238  }
1239 
1240 };
1241 
1242 #endif
1243 
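A similar editorial sketch for XMMReg4Double, showing the mask-and-blend idiom these helpers support: the comparison helpers (Equals, NotEquals, Greater) produce per-lane all-ones/all-zeros masks, which Ternary() then uses to select between two registers. Names below are hypothetical, and Min() would of course achieve this particular clamp more directly; the sketch only illustrates the mask/Ternary pattern.

static void SketchClampToMax(const double* padfIn, double dfMax, double* padfOut)
{
    XMMReg4Double oVal  = XMMReg4Double::Load4Val(padfIn);
    XMMReg4Double oMax  = XMMReg4Double::Load1ValHighAndLow(&dfMax);
    XMMReg4Double oMask = XMMReg4Double::Greater(oVal, oMax);        /* all-ones where oVal > dfMax */
    XMMReg4Double oRes  = XMMReg4Double::Ternary(oMask, oMax, oVal); /* take dfMax there, oVal elsewhere */
    oRes.Store4Val(padfOut);
}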
1244 #endif /* #ifndef DOXYGEN_SKIP */
1245 
1246 #endif /* GDALSSE_PRIV_H_INCLUDED */
Referenced CPL types (core portability definitions from cpl_port.h): GInt32 = int (Int32 type, cpl_port.h:205); GInt16 = short (Int16 type, cpl_port.h:211); GUInt16 = unsigned short (unsigned Int16 type, cpl_port.h:213); GInt64 = GIntBig (signed 64-bit integer type, cpl_port.h:267).
