#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#include "cpl_port.h" /* GInt16, GUInt16, GInt32, GInt64 */
#include <string.h>   /* memcpy(), memset() */

/* Real SSE2 code is only used on 64-bit x86 (or when USE_SSE2 is defined);
   otherwise the software emulation further below is compiled instead. */
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
    !defined(USE_SSE2_EMULATION)

#include <emmintrin.h>

#ifdef __SSE4_1__ /* guard reconstructed: the excerpt dropped the original
                     preprocessor condition around the SSE4.1 include */
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"
static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}

/* The analogous GDALCopyXMMToInt32() and GDALCopyXMMToInt64() helpers, used by
   the Store*Val() methods below, are not part of this excerpt. */
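/* Illustrative sketch added to this excerpt (not part of the original GDAL
   header): the helpers above move scalars in and out of XMM registers through
   memcpy(), which is safe for unaligned addresses and does not break strict
   aliasing. A hypothetical round-trip through lane 0 looks like this: */
static inline void GDALExampleRoundTripInt16(const void *pSrc, void *pDst)
{
    /* Load 2 bytes into lane 0 of an XMM register, then store them back. */
    GDALCopyXMMToInt16(GDALCopyInt16ToXMM(pSrc), pDst);
}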
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }
    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__ /* guard reconstructed: the excerpt dropped the original
                     preprocessor conditions around the SSE4.1 fast paths */
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i);
        /* arithmetic shift restores the sign: 0|0|0|0|sign(b)|b|sign(a)|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }
    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }
    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        /* Move the high double into the low word of xmm2, then add */
        xmm2 = _mm_shuffle_pd(xmm, xmm, _MM_SHUFFLE2(0, 1));
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }
    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store2Val(unsigned char *ptr) const
    {
        /* Add 0.5 then truncate: round-to-nearest of the 2 doubles to int32 */
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }

    inline void Store2Val(unsigned short *ptr) const
    {
        /* Add 0.5 then truncate: round-to-nearest of the 2 doubles to int32 */
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5)));
        /* X X X X 0 B 0 A --> X X X X X X B A */
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
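/* Illustrative sketch added to this excerpt (not part of the original GDAL
   header): Equals(), Greater() and friends return a per-lane all-ones or
   all-zeros bit pattern, which Ternary() consumes as a branchless select.
   A hypothetical clamp-to-zero built from these primitives: */
static inline XMMReg2Double GDALExampleClampToZero(const XMMReg2Double &v)
{
    const XMMReg2Double zero = XMMReg2Double::Zero();
    /* mask = (v > 0) ? all-ones : all-zeros ; result = mask ? v : 0 */
    return XMMReg2Double::Ternary(XMMReg2Double::Greater(v, zero), v, zero);
}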
#else /* Software emulation of SSE2 */

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2 !"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() = default;

    XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }
    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }
    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }
    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }
    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }
    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }
    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }
    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) */
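/* Illustrative usage sketch added to this excerpt (not part of the original
   GDAL header): whichever of the two XMMReg2Double implementations above got
   compiled, the interface is identical. A hypothetical 2-element dot product: */
static inline double GDALExampleDot2(const double *padfA, const double *padfB)
{
    const XMMReg2Double a = XMMReg2Double::Load2Val(padfA);
    const XMMReg2Double b = XMMReg2Double::Load2Val(padfB);
    return (a * b).GetHorizSum(); /* a[0]*b[0] + a[1]*b[1] */
}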
#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }
    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }
    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        /* signed conversion is fine: the values fit in the positive int32 range */
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }
    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }
    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }
    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        /* Extract byte 0, 4, 8 and 12 of each 32 bit word (SSSE3) */
        xmm_i =
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                                      (12 << 24)));
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i); /* Pack uint32 to uint16 */
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};
#else /* !defined(__AVX__): compose XMMReg4Double from two XMMReg2Double */

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }
    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }
    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }
    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }
    inline void Store4Val(unsigned char *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        /* Round the two pairs of doubles to int32 */
        __m128i tmpLow =
            _mm_cvttpd_epi32(_mm_add_pd(low.xmm, _mm_set1_pd(0.5)));
        __m128i tmpHigh =
            _mm_cvttpd_epi32(_mm_add_pd(high.xmm, _mm_set1_pd(0.5)));
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
#endif
    }

    inline void Store4Val(unsigned short *ptr) const
    {
#if 1 /* preprocessor split reconstructed: the excerpt dropped the directives
         around this disabled SIMD alternative */
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#ifdef __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0); /* Pack uint32 to uint16 */
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }

    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }
};

#endif /* defined(__AVX__) && !defined(USE_SSE2_EMULATION) */
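/* Illustrative usage sketch added to this excerpt (not part of the original
   GDAL header): the AVX and SSE2-pair implementations of XMMReg4Double expose
   the same interface, e.g. for scaling 4 byte values into floats: */
static inline void GDALExampleScale4Bytes(const unsigned char *pabySrc,
                                          double dfScale, float *pafDst)
{
    const XMMReg4Double v = XMMReg4Double::Load4Val(pabySrc);
    const XMMReg4Double scale = XMMReg4Double::Load1ValHighAndLow(&dfScale);
    (v * scale).Store4Val(pafDst);
}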
/* Fixed-width integer types used above come from cpl_port.h ("Core portability
 * definitions for CPL"): GInt16 is a signed 16 bit integer (short), GUInt16 an
 * unsigned 16 bit integer (unsigned short), GInt32 a signed 32 bit integer
 * (int), and GInt64 a signed 64 bit integer (GIntBig). */