gdalsse_priv.h
/******************************************************************************
 * $Id$
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#ifndef DOXYGEN_SKIP

#include "cpl_port.h"

/* We restrict ourselves to 64-bit processors because they are guaranteed */
/* to have SSE2. This could also be used on 32-bit x86, but we would need */
/* a runtime check. */
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
    !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

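/* The GDALCopy*ToXMM / GDALCopyXMMTo* helpers below move small, possibly
 * unaligned blocks of memory into and out of XMM registers. Going through
 * memcpy() into a local variable is the portable way of expressing an
 * unaligned, strict-aliasing-safe load or store: compilers optimize it into
 * a single scalar move, so no actual function call is emitted.
 */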
static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
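    /* 32-bit x86 has no 64-bit GPR-to-XMM move (_mm_cvtsi64_si128 is only
       available in 64-bit mode), so load the low quadword directly. */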
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}

class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

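    /* Branchless select: cond is expected to be an all-ones / all-zeros
       lane mask (as produced by Equals/NotEquals/Greater), so
       (cond & true_expr) | (~cond & false_expr) picks, per lane, the bits
       of true_expr where the mask is set and of false_expr elsewhere. */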
    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

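    /* Without SSE4.1, int16 to int32 sign-extension is done by duplicating
       each 16-bit value into the upper half of a 32-bit lane and
       arithmetic-shifting right by 16, which replicates the sign bit. */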
    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(
            xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i,
            _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

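    /* Horizontal sum: for xmm = {a, b}, the shuffle produces xmm2 = {b, a},
       _mm_add_sd() adds the two low lanes giving a + b, and
       _mm_cvtsd_f64() extracts that scalar. */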
    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(
            xmm, xmm,
            _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

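    /* Adding 0.5 before the truncating conversion rounds half up for the
       non-negative values expected here; _mm_packs_epi32() and
       _mm_packus_epi16() then saturate the result to the [0, 255] range
       of unsigned char. */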
    inline void Store2Val(unsigned char *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }

    inline void Store2Val(unsigned short *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
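
/* Illustrative usage of XMMReg2Double (a sketch, not part of the original
 * header; GDALExampleSum is a hypothetical name): summing an array of
 * doubles two elements at a time, with a scalar loop for the tail.
 *
 *   static double GDALExampleSum(const double *padfVals, size_t nCount)
 *   {
 *       XMMReg2Double oSum = XMMReg2Double::Zero();
 *       size_t i = 0;
 *       for (; i + 1 < nCount; i += 2)
 *           oSum += XMMReg2Double::Load2Val(padfVals + i);
 *       double dfTotal = oSum.GetHorizSum();
 *       for (; i < nCount; ++i)
 *           dfTotal += padfVals[i];
 *       return dfTotal;
 *   }
 */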

#else

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2!"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double()
    {
    }

    XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
          !defined(USE_SSE2_EMULATION) */

#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(
            xmm_i);  // ok to use signed conversion since we are in the ushort
                     // range, so cannot be interpreted as negative int32
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

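    /* Horizontal sum of {a, b, c, d}: _mm256_hadd_pd(ymm, ymm) produces
       {a+b, a+b, c+d, c+d}; swapping its 128-bit halves with
       _mm256_permute2f128_pd() and adding yields a+b+c+d in every lane,
       from which the low lane is extracted. */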
    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        // xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
        // xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
        xmm_i =
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                                      (12 << 24)));  // SSSE3
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};

#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

    inline void Store4Val(unsigned char *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
            low.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
            high.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
#endif
    }

    inline void Store4Val(unsigned short *ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#ifdef __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack uint32 to uint16
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }

    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }
};

#endif
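
/* Illustrative usage of XMMReg4Double (a sketch, not part of the original
 * header; GDALExampleScale is a hypothetical name): converting unsigned
 * short pixels to scaled floats, four values per iteration. nCount is
 * assumed to be a multiple of 4 to keep the example short.
 *
 *   static void GDALExampleScale(const unsigned short *panIn, float *pafOut,
 *                                size_t nCount, double dfScale)
 *   {
 *       const XMMReg4Double oScale =
 *           XMMReg4Double::Load1ValHighAndLow(&dfScale);
 *       for (size_t i = 0; i < nCount; i += 4)
 *       {
 *           XMMReg4Double oVals = XMMReg4Double::Load4Val(panIn + i);
 *           oVals *= oScale;
 *           oVals.Store4Val(pafOut + i);
 *       }
 *   }
 */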

#endif /* #ifndef DOXYGEN_SKIP */

#endif /* GDALSSE_PRIV_H_INCLUDED */