EstervQrCode 1.1.1
Library for QR code manipulation
intrin_sse.hpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44 
45 #ifndef OPENCV_HAL_SSE_HPP
46 #define OPENCV_HAL_SSE_HPP
47 
48 #include <algorithm>
49 #include "opencv2/core/utility.hpp"
50 
51 #define CV_SIMD128 1
52 #define CV_SIMD128_64F 1
53 #define CV_SIMD128_FP16 0 // no native operations with FP16 type.
54 
55 namespace cv
56 {
57 
59 
60 //
61 // Compilation troubleshooting:
62 // - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned
63 // Replace the parameter declaration with a const reference:
64 // -v_int32x4 a
65 // +const v_int32x4& a
66 //
67 
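// Illustrative sketch (editor's addition, not part of the upstream header): the
// C2719 workaround is simply to pass the 16-byte vector types by const reference
// instead of by value, e.g. for a hypothetical helper add3():
//
//     v_int32x4 add3(v_int32x4 a, v_int32x4 b, v_int32x4 c);        // may trigger C2719 on 32-bit MSVC
//     v_int32x4 add3(const v_int32x4& a, const v_int32x4& b,
//                    const v_int32x4& c);                           // portable declaration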
68 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
69 
71 
72 struct v_uint8x16
73 {
74  typedef uchar lane_type;
75  typedef __m128i vector_type;
76  enum { nlanes = 16 };
77 
78  /* coverity[uninit_ctor]: suppress warning */
79  v_uint8x16() {}
80  explicit v_uint8x16(__m128i v) : val(v) {}
81  v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
82  uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
83  {
84  val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
85  (char)v4, (char)v5, (char)v6, (char)v7,
86  (char)v8, (char)v9, (char)v10, (char)v11,
87  (char)v12, (char)v13, (char)v14, (char)v15);
88  }
89 
90  uchar get0() const
91  {
92  return (uchar)_mm_cvtsi128_si32(val);
93  }
94 
95  __m128i val;
96 };
97 
98 struct v_int8x16
99 {
100  typedef schar lane_type;
101  typedef __m128i vector_type;
102  enum { nlanes = 16 };
103 
104  /* coverity[uninit_ctor]: suppress warning */
105  v_int8x16() {}
106  explicit v_int8x16(__m128i v) : val(v) {}
107  v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
108  schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
109  {
110  val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
111  (char)v4, (char)v5, (char)v6, (char)v7,
112  (char)v8, (char)v9, (char)v10, (char)v11,
113  (char)v12, (char)v13, (char)v14, (char)v15);
114  }
115 
116  schar get0() const
117  {
118  return (schar)_mm_cvtsi128_si32(val);
119  }
120 
121  __m128i val;
122 };
123 
124 struct v_uint16x8
125 {
126  typedef ushort lane_type;
127  typedef __m128i vector_type;
128  enum { nlanes = 8 };
129 
130  /* coverity[uninit_ctor]: suppress warning */
131  v_uint16x8() {}
132  explicit v_uint16x8(__m128i v) : val(v) {}
133  v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
134  {
135  val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
136  (short)v4, (short)v5, (short)v6, (short)v7);
137  }
138 
139  ushort get0() const
140  {
141  return (ushort)_mm_cvtsi128_si32(val);
142  }
143 
144  __m128i val;
145 };
146 
147 struct v_int16x8
148 {
149  typedef short lane_type;
150  typedef __m128i vector_type;
151  enum { nlanes = 8 };
152 
153  /* coverity[uninit_ctor]: suppress warning */
154  v_int16x8() {}
155  explicit v_int16x8(__m128i v) : val(v) {}
156  v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
157  {
158  val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
159  (short)v4, (short)v5, (short)v6, (short)v7);
160  }
161 
162  short get0() const
163  {
164  return (short)_mm_cvtsi128_si32(val);
165  }
166 
167  __m128i val;
168 };
169 
170 struct v_uint32x4
171 {
172  typedef unsigned lane_type;
173  typedef __m128i vector_type;
174  enum { nlanes = 4 };
175 
176  /* coverity[uninit_ctor]: suppress warning */
177  v_uint32x4() {}
178  explicit v_uint32x4(__m128i v) : val(v) {}
179  v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
180  {
181  val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
182  }
183 
184  unsigned get0() const
185  {
186  return (unsigned)_mm_cvtsi128_si32(val);
187  }
188 
189  __m128i val;
190 };
191 
192 struct v_int32x4
193 {
194  typedef int lane_type;
195  typedef __m128i vector_type;
196  enum { nlanes = 4 };
197 
198  /* coverity[uninit_ctor]: suppress warning */
199  v_int32x4() {}
200  explicit v_int32x4(__m128i v) : val(v) {}
201  v_int32x4(int v0, int v1, int v2, int v3)
202  {
203  val = _mm_setr_epi32(v0, v1, v2, v3);
204  }
205 
206  int get0() const
207  {
208  return _mm_cvtsi128_si32(val);
209  }
210 
211  __m128i val;
212 };
213 
214 struct v_float32x4
215 {
216  typedef float lane_type;
217  typedef __m128 vector_type;
218  enum { nlanes = 4 };
219 
220  /* coverity[uninit_ctor]: suppress warning */
221  v_float32x4() {}
222  explicit v_float32x4(__m128 v) : val(v) {}
223  v_float32x4(float v0, float v1, float v2, float v3)
224  {
225  val = _mm_setr_ps(v0, v1, v2, v3);
226  }
227 
228  float get0() const
229  {
230  return _mm_cvtss_f32(val);
231  }
232 
233  __m128 val;
234 };
235 
236 struct v_uint64x2
237 {
238  typedef uint64 lane_type;
239  typedef __m128i vector_type;
240  enum { nlanes = 2 };
241 
242  /* coverity[uninit_ctor]: suppress warning */
243  v_uint64x2() {}
244  explicit v_uint64x2(__m128i v) : val(v) {}
245  v_uint64x2(uint64 v0, uint64 v1)
246  {
247 #if defined(_MSC_VER) && _MSC_VER >= 1920/*MSVS 2019*/ && defined(_M_X64) && !defined(__clang__)
248  val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
249 #elif defined(__GNUC__)
250  val = _mm_setr_epi64((__m64)v0, (__m64)v1);
251 #else
252  val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
253 #endif
254  }
255 
256  uint64 get0() const
257  {
258  #if !defined(__x86_64__) && !defined(_M_X64)
259  int a = _mm_cvtsi128_si32(val);
260  int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
261  return (unsigned)a | ((uint64)(unsigned)b << 32);
262  #else
263  return (uint64)_mm_cvtsi128_si64(val);
264  #endif
265  }
266 
267  __m128i val;
268 };
269 
270 struct v_int64x2
271 {
272  typedef int64 lane_type;
273  typedef __m128i vector_type;
274  enum { nlanes = 2 };
275 
276  /* coverity[uninit_ctor]: suppress warning */
277  v_int64x2() {}
278  explicit v_int64x2(__m128i v) : val(v) {}
279  v_int64x2(int64 v0, int64 v1)
280  {
281 #if defined(_MSC_VER) && _MSC_VER >= 1920/*MSVS 2019*/ && defined(_M_X64) && !defined(__clang__)
282  val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
283 #elif defined(__GNUC__)
284  val = _mm_setr_epi64((__m64)v0, (__m64)v1);
285 #else
286  val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
287 #endif
288  }
289 
290  int64 get0() const
291  {
292  #if !defined(__x86_64__) && !defined(_M_X64)
293  int a = _mm_cvtsi128_si32(val);
294  int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
295  return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
296  #else
297  return _mm_cvtsi128_si64(val);
298  #endif
299  }
300 
301  __m128i val;
302 };
303 
304 struct v_float64x2
305 {
306  typedef double lane_type;
307  typedef __m128d vector_type;
308  enum { nlanes = 2 };
309 
310  /* coverity[uninit_ctor]: suppress warning */
311  v_float64x2() {}
312  explicit v_float64x2(__m128d v) : val(v) {}
313  v_float64x2(double v0, double v1)
314  {
315  val = _mm_setr_pd(v0, v1);
316  }
317 
318  double get0() const
319  {
320  return _mm_cvtsd_f64(val);
321  }
322 
323  __m128d val;
324 };
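// Usage sketch (editor's addition, not part of the upstream header): each wrapper
// above is a thin value type around a single 128-bit register, so typical code
// mixes the wrappers with raw SSE intrinsics freely:
//
//     v_float32x4 v(1.f, 2.f, 3.f, 4.f);    // lanes set via _mm_setr_ps
//     float first = v.get0();               // lowest lane, here 1.f
//     __m128 raw = v.val;                   // the underlying SSE register
//     v_float32x4 w(_mm_add_ps(raw, raw));  // wrap an intrinsic result again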
325 
326 namespace hal_sse_internal
327 {
328  template <typename to_sse_type, typename from_sse_type>
329  to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
330 
331 #define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
332  template<> inline \
333  to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
334  { return sse_cast_intrin(a); }
335 
336  OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
337  OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
338  OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
339  OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
340  OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
341  OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
342  OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
343  OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
344  OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
345 }
346 
347 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
348 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
349 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
350 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
351 { return _Tpvec(cast(a.val)); }
352 
353 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
354 OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
355 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
356 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
357 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
358 OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
359 OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
360 OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
361 
362 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
363 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
364 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
365 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
366 
367 template<typename _Tpvec> inline
368 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
369 template<typename _Tpvec> inline
370 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
371 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
372 { return v_float32x4(_mm_castsi128_ps(a.val)); }
373 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
374 { return v_float32x4(_mm_castsi128_ps(a.val)); }
375 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
376 { return v_float64x2(_mm_castsi128_pd(a.val)); }
377 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
378 { return v_float64x2(_mm_castsi128_pd(a.val)); }
379 
380 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
381 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
382 { return _Tpvec(_mm_castps_si128(a.val)); } \
383 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
384 { return _Tpvec(_mm_castpd_si128(a.val)); }
385 
386 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
387 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
388 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
389 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
390 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
391 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
392 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
393 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
394 
395 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
396 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
397 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
398 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
399 
401 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
402 {
403  __m128i delta = _mm_set1_epi16(255);
404  return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
405  _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
406 }
407 
408 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
409 {
410  __m128i delta = _mm_set1_epi16(255);
411  __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
412  _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
413 }
414 
415 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
416 { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
417 
418 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
419 { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
420 
421 template<int n> inline
422 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
423 {
424  // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
425  __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
426  return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
427  _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
428 }
429 
430 template<int n> inline
431 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
432 {
433  __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
434  __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
435  _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
436 }
437 
438 template<int n> inline
439 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
440 {
441  __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
442  return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
443  _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
444 }
445 
446 template<int n> inline
447 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
448 {
449  __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
450  __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
451  _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
452 }
453 
454 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
455 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
456 
457 inline void v_pack_store(schar* ptr, const v_int16x8& a)
458 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
459 
460 template<int n> inline
461 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
462 {
463  // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
464  __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
465  return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
466  _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
467 }
468 template<int n> inline
469 void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
470 {
471  // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
472  __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
473  __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
474  _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
475 }
476 
477 
478 // byte-wise "mask ? a : b"
479 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
480 {
481 #if CV_SSE4_1
482  return _mm_blendv_epi8(b, a, mask);
483 #else
484  return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
485 #endif
486 }
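// Editor's note: in the SSE2 fallback above, b ^ ((a ^ b) & mask) selects, for
// every bit, a where mask is 1 and b where mask is 0, so one and/xor pair gives
// a bit-wise (hence also byte-wise) select without SSE4.1 _mm_blendv_epi8.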
487 
488 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
489 { return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }
490 
491 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
492 {
493  __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
494  __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
495  __m128i r = _mm_packs_epi32(a1, a1);
496  _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
497 }
498 
499 template<int n> inline
500 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
501 {
502  __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
503  __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
504  __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
505  return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
506 }
507 
508 template<int n> inline
509 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
510 {
511  __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
512  __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
513  __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
514  _mm_storel_epi64((__m128i*)ptr, a2);
515 }
516 
517 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
518 {
519 #if CV_SSE4_1
520  return v_uint16x8(_mm_packus_epi32(a.val, b.val));
521 #else
522  __m128i delta32 = _mm_set1_epi32(32768);
523 
524  // first, saturate negative values to zero
525  __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
526  __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));
527 
528  __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
529  return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
530 #endif
531 }
532 
533 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
534 {
535 #if CV_SSE4_1
536  _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));
537 #else
538  __m128i delta32 = _mm_set1_epi32(32768);
539  __m128i a1 = _mm_sub_epi32(a.val, delta32);
540  __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
541  _mm_storel_epi64((__m128i*)ptr, r);
542 #endif
543 }
544 
545 template<int n> inline
546 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
547 {
548 #if CV_SSE4_1
549  __m128i delta = _mm_set1_epi32(1 << (n - 1));
550  return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
551  _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
552 #else
553  __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
554  __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
555  __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
556  __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
557  __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
558  return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
559 #endif
560 }
561 
562 template<int n> inline
563 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
564 {
565 #if CV_SSE4_1
566  __m128i delta = _mm_set1_epi32(1 << (n - 1));
567  __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
568  _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1));
569 #else
570  __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
571  __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
572  __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
573  _mm_storel_epi64((__m128i*)ptr, a2);
574 #endif
575 }
576 
577 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
578 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
579 
580 inline void v_pack_store(short* ptr, const v_int32x4& a)
581 {
582  _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
583 }
584 
585 template<int n> inline
586 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
587 {
588  __m128i delta = _mm_set1_epi32(1 << (n-1));
589  return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
590  _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
591 }
592 
593 template<int n> inline
594 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
595 {
596  __m128i delta = _mm_set1_epi32(1 << (n-1));
597  __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
598  _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
599 }
600 
601 
602 // [a0 0 | b0 0] [a1 0 | b1 0]
603 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
604 {
605  __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
606  __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
607  return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
608 }
609 
610 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
611 {
612  __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
613  _mm_storel_epi64((__m128i*)ptr, a1);
614 }
615 
616 // [a0 0 | b0 0] [a1 0 | b1 0]
617 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
618 {
619  __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
620  __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
621  return v_int32x4(_mm_unpacklo_epi32(v0, v1));
622 }
623 
624 inline void v_pack_store(int* ptr, const v_int64x2& a)
625 {
626  __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
627  _mm_storel_epi64((__m128i*)ptr, a1);
628 }
629 
630 template<int n> inline
631 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
632 {
633  uint64 delta = (uint64)1 << (n-1);
634  v_uint64x2 delta2(delta, delta);
635  __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
636  __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
637  __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a1[0].lo b1[0].lo a1[0].hi b1[0].hi
638  __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1[1].lo b1[1].lo a1[1].hi b1[1].hi
639  return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
640 }
641 
642 template<int n> inline
643 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
644 {
645  uint64 delta = (uint64)1 << (n-1);
646  v_uint64x2 delta2(delta, delta);
647  __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
648  __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
649  _mm_storel_epi64((__m128i*)ptr, a2);
650 }
651 
652 inline __m128i v_sign_epi64(__m128i a)
653 {
654  return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
655 }
656 
657 inline __m128i v_srai_epi64(__m128i a, int imm)
658 {
659  __m128i smask = v_sign_epi64(a);
660  return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
661 }
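// Editor's note: SSE2 has no 64-bit arithmetic shift, so v_srai_epi64() relies on
// the identity  (x >> n)_arithmetic == ((x ^ s) >>_logical n) ^ s  with s being the
// per-lane sign mask (all ones for negative lanes): negative values are bit-flipped,
// shifted with zero fill, and flipped back, which reproduces the sign extension.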
662 
663 template<int n> inline
664 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
665 {
666  int64 delta = (int64)1 << (n-1);
667  v_int64x2 delta2(delta, delta);
668  __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
669  __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
670  __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a1[0].lo b1[0].lo a1[0].hi b1[0].hi
671  __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1[1].lo b1[1].lo a1[1].hi b1[1].hi
672  return v_int32x4(_mm_unpacklo_epi32(v0, v1));
673 }
674 
675 template<int n> inline
676 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
677 {
678  int64 delta = (int64)1 << (n-1);
679  v_int64x2 delta2(delta, delta);
680  __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
681  __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
682  _mm_storel_epi64((__m128i*)ptr, a2);
683 }
684 
685 // pack boolean
686 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
687 {
688  __m128i ab = _mm_packs_epi16(a.val, b.val);
689  return v_uint8x16(ab);
690 }
691 
692 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
693  const v_uint32x4& c, const v_uint32x4& d)
694 {
695  __m128i ab = _mm_packs_epi32(a.val, b.val);
696  __m128i cd = _mm_packs_epi32(c.val, d.val);
697  return v_uint8x16(_mm_packs_epi16(ab, cd));
698 }
699 
700 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
701  const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
702  const v_uint64x2& g, const v_uint64x2& h)
703 {
704  __m128i ab = _mm_packs_epi32(a.val, b.val);
705  __m128i cd = _mm_packs_epi32(c.val, d.val);
706  __m128i ef = _mm_packs_epi32(e.val, f.val);
707  __m128i gh = _mm_packs_epi32(g.val, h.val);
708 
709  __m128i abcd = _mm_packs_epi32(ab, cd);
710  __m128i efgh = _mm_packs_epi32(ef, gh);
711  return v_uint8x16(_mm_packs_epi16(abcd, efgh));
712 }
713 
714 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
715  const v_float32x4& m1, const v_float32x4& m2,
716  const v_float32x4& m3)
717 {
718  __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
719  __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
720  __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
721  __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
722 
723  return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
724 }
725 
726 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
727  const v_float32x4& m1, const v_float32x4& m2,
728  const v_float32x4& a)
729 {
730  __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
731  __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
732  __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
733 
734  return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
735 }
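// Usage sketch (editor's addition): v_matmul() treats m0..m3 as the columns of a
// 4x4 matrix, returning v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, i.e. M*v for a
// column vector; v_matmuladd() replaces the last term with a constant vector,
// which matches the usual affine transform. With hypothetical columns c0..c2
// and translation t:
//
//     v_float32x4 p(x, y, z, 0.f);
//     v_float32x4 q = v_matmuladd(p, c0, c1, c2, t);  // q = x*c0 + y*c1 + z*c2 + t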
736 
737 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
738  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
739  { \
740  return _Tpvec(intrin(a.val, b.val)); \
741  } \
742  inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
743  { \
744  a.val = intrin(a.val, b.val); \
745  return a; \
746  }
747 
748 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
749 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
750 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
751 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
752 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
753 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
754 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
755 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
756 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
757 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
758 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
759 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
760 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
761 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
762 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
763 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
764 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
765 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
766 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
767 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
768 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
769 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
770 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
771 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
772 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
773 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
774 
775 // saturating multiply 8-bit, 16-bit
776 #define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
777  inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
778  { \
779  _Tpwvec c, d; \
780  v_mul_expand(a, b, c, d); \
781  return v_pack(c, d); \
782  } \
783  inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
784  { a = a * b; return a; }
785 
786 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
787 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
788 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
789 OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4)
790 
791 // Multiply and expand
792 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
793  v_uint16x8& c, v_uint16x8& d)
794 {
795  v_uint16x8 a0, a1, b0, b1;
796  v_expand(a, a0, a1);
797  v_expand(b, b0, b1);
798  c = v_mul_wrap(a0, b0);
799  d = v_mul_wrap(a1, b1);
800 }
801 
802 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
803  v_int16x8& c, v_int16x8& d)
804 {
805  v_int16x8 a0, a1, b0, b1;
806  v_expand(a, a0, a1);
807  v_expand(b, b0, b1);
808  c = v_mul_wrap(a0, b0);
809  d = v_mul_wrap(a1, b1);
810 }
811 
812 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
813  v_int32x4& c, v_int32x4& d)
814 {
815  __m128i v0 = _mm_mullo_epi16(a.val, b.val);
816  __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
817  c.val = _mm_unpacklo_epi16(v0, v1);
818  d.val = _mm_unpackhi_epi16(v0, v1);
819 }
820 
821 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
822  v_uint32x4& c, v_uint32x4& d)
823 {
824  __m128i v0 = _mm_mullo_epi16(a.val, b.val);
825  __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
826  c.val = _mm_unpacklo_epi16(v0, v1);
827  d.val = _mm_unpackhi_epi16(v0, v1);
828 }
829 
830 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
831  v_uint64x2& c, v_uint64x2& d)
832 {
833  __m128i c0 = _mm_mul_epu32(a.val, b.val);
834  __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
835  c.val = _mm_unpacklo_epi64(c0, c1);
836  d.val = _mm_unpackhi_epi64(c0, c1);
837 }
838 
839 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
840 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
841 
843 
844 // 16 >> 32
845 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
846 { return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
847 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
848 { return v_dotprod(a, b) + c; }
849 
850 // 32 >> 64
851 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
852 {
853 #if CV_SSE4_1
854  __m128i even = _mm_mul_epi32(a.val, b.val);
855  __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
856  return v_int64x2(_mm_add_epi64(even, odd));
857 #else
858  __m128i even_u = _mm_mul_epu32(a.val, b.val);
859  __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
860  // correct the unsigned 32x32->64 products to signed products (from: Agner Fog (vectorclass library) and H. S. Warren: Hacker's Delight, 2003, p. 132)
861  __m128i a_sign = _mm_srai_epi32(a.val, 31);
862  __m128i b_sign = _mm_srai_epi32(b.val, 31);
864  // each operand masked by the sign of the other: a where b < 0, b where a < 0
864  __m128i axb = _mm_and_si128(a.val, b_sign);
865  __m128i bxa = _mm_and_si128(b.val, a_sign);
866  // sum of sign corrections
867  __m128i ssum = _mm_add_epi32(bxa, axb);
868  __m128i even_ssum = _mm_slli_epi64(ssum, 32);
869  __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));
870  // convert to signed and prod
871  return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
872 #endif
873 }
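// Editor's note on the SSE2 branch above: reading a 32-bit lane x as signed gives
// x_signed = x_unsigned - 2^32 * (x < 0), hence
//     a_s * b_s = a_u * b_u - 2^32 * ((a_u & sign(b)) + (b_u & sign(a)))   (mod 2^64)
// even_ssum and odd_ssum are exactly these correction terms, pre-shifted to bit 32
// of the even and odd 64-bit products respectively.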
874 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
875 { return v_dotprod(a, b) + c; }
876 
877 // 8 >> 32
878 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
879 {
880  __m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even
881  __m128i a1 = _mm_srli_epi16(a.val, 8); // odd
882  __m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
883  __m128i b1 = _mm_srli_epi16(b.val, 8);
884  __m128i p0 = _mm_madd_epi16(a0, b0);
885  __m128i p1 = _mm_madd_epi16(a1, b1);
886  return v_uint32x4(_mm_add_epi32(p0, p1));
887 }
888 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
889 { return v_dotprod_expand(a, b) + c; }
890 
891 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
892 {
893  __m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even
894  __m128i a1 = _mm_srai_epi16(a.val, 8); // odd
895  __m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
896  __m128i b1 = _mm_srai_epi16(b.val, 8);
897  __m128i p0 = _mm_madd_epi16(a0, b0);
898  __m128i p1 = _mm_madd_epi16(a1, b1);
899  return v_int32x4(_mm_add_epi32(p0, p1));
900 }
901 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
902 { return v_dotprod_expand(a, b) + c; }
903 
904 // 16 >> 64
905 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
906 {
907  v_uint32x4 c, d;
908  v_mul_expand(a, b, c, d);
909 
910  v_uint64x2 c0, c1, d0, d1;
911  v_expand(c, c0, c1);
912  v_expand(d, d0, d1);
913 
914  c0 += c1; d0 += d1;
915  return v_uint64x2(_mm_add_epi64(
916  _mm_unpacklo_epi64(c0.val, d0.val),
917  _mm_unpackhi_epi64(c0.val, d0.val)
918  ));
919 }
920 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
921 { return v_dotprod_expand(a, b) + c; }
922 
923 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
924 {
925  v_int32x4 prod = v_dotprod(a, b);
926  v_int64x2 c, d;
927  v_expand(prod, c, d);
928  return v_int64x2(_mm_add_epi64(
929  _mm_unpacklo_epi64(c.val, d.val),
930  _mm_unpackhi_epi64(c.val, d.val)
931  ));
932 }
933 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
934 { return v_dotprod_expand(a, b) + c; }
935 
936 // 32 >> 64f
937 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
938 {
939 #if CV_SSE4_1
940  return v_cvt_f64(v_dotprod(a, b));
941 #else
942  v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
943  v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
944 
945  return v_float64x2(_mm_add_pd(
946  _mm_unpacklo_pd(c.val, d.val),
947  _mm_unpackhi_pd(c.val, d.val)
948  ));
949 #endif
950 }
951 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
952 { return v_dotprod_expand(a, b) + c; }
953 
955 
956 // 16 >> 32
957 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
958 { return v_dotprod(a, b); }
959 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
960 { return v_dotprod(a, b) + c; }
961 
962 // 32 >> 64
963 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
964 { return v_dotprod(a, b); }
965 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
966 { return v_dotprod_fast(a, b) + c; }
967 
968 // 8 >> 32
969 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
970 {
971  __m128i a0 = v_expand_low(a).val;
972  __m128i a1 = v_expand_high(a).val;
973  __m128i b0 = v_expand_low(b).val;
974  __m128i b1 = v_expand_high(b).val;
975  __m128i p0 = _mm_madd_epi16(a0, b0);
976  __m128i p1 = _mm_madd_epi16(a1, b1);
977  return v_uint32x4(_mm_add_epi32(p0, p1));
978 }
979 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
980 { return v_dotprod_expand_fast(a, b) + c; }
981 
982 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
983 {
984 #if CV_SSE4_1
985  __m128i a0 = _mm_cvtepi8_epi16(a.val);
986  __m128i a1 = v_expand_high(a).val;
987  __m128i b0 = _mm_cvtepi8_epi16(b.val);
988  __m128i b1 = v_expand_high(b).val;
989  __m128i p0 = _mm_madd_epi16(a0, b0);
990  __m128i p1 = _mm_madd_epi16(a1, b1);
991  return v_int32x4(_mm_add_epi32(p0, p1));
992 #else
993  return v_dotprod_expand(a, b);
994 #endif
995 }
996 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
997 { return v_dotprod_expand_fast(a, b) + c; }
998 
999 // 16 >> 64
1000 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1001 {
1002  v_uint32x4 c, d;
1003  v_mul_expand(a, b, c, d);
1004 
1005  v_uint64x2 c0, c1, d0, d1;
1006  v_expand(c, c0, c1);
1007  v_expand(d, d0, d1);
1008 
1009  c0 += c1; d0 += d1;
1010  return c0 + d0;
1011 }
1012 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1013 { return v_dotprod_expand_fast(a, b) + c; }
1014 
1015 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1016 {
1017  v_int32x4 prod = v_dotprod(a, b);
1018  v_int64x2 c, d;
1019  v_expand(prod, c, d);
1020  return c + d;
1021 }
1022 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1023 { return v_dotprod_expand_fast(a, b) + c; }
1024 
1025 // 32 >> 64f
1026 v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
1027 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1028 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
1029 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1030 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
1031 
1032 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
1033  OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
1034  OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
1035  OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
1036  inline _Tpvec operator ~ (const _Tpvec& a) \
1037  { \
1038  return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
1039  }
1040 
1041 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
1042 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
1043 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
1044 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
1045 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
1046 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
1047 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
1048 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
1049 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
1050 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
1051 
1052 inline v_float32x4 v_sqrt(const v_float32x4& x)
1053 { return v_float32x4(_mm_sqrt_ps(x.val)); }
1054 
1055 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1056 {
1057  const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
1058  __m128 t = x.val;
1059  __m128 h = _mm_mul_ps(t, _0_5);
1060  t = _mm_rsqrt_ps(t);
1061  t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
1062  return v_float32x4(t);
1063 }
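// Editor's note: _mm_rsqrt_ps() returns only a ~12-bit approximation, so v_invsqrt()
// above refines it with one Newton-Raphson step for y ~ 1/sqrt(x):
//     y1 = y0 * (1.5 - 0.5 * x * y0 * y0)
// which roughly doubles the number of correct bits.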
1064 
1065 inline v_float64x2 v_sqrt(const v_float64x2& x)
1066 { return v_float64x2(_mm_sqrt_pd(x.val)); }
1067 
1068 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1069 {
1070  const __m128d v_1 = _mm_set1_pd(1.);
1071  return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
1072 }
1073 
1074 #define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
1075 inline _Tpuvec v_abs(const _Tpsvec& x) \
1076 { return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
1077 
1078 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
1079 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
1080 inline v_uint32x4 v_abs(const v_int32x4& x)
1081 {
1082  __m128i s = _mm_srli_epi32(x.val, 31);
1083  __m128i f = _mm_srai_epi32(x.val, 31);
1084  return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
1085 }
1086 inline v_float32x4 v_abs(const v_float32x4& x)
1087 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
1088 inline v_float64x2 v_abs(const v_float64x2& x)
1089 {
1090  return v_float64x2(_mm_and_pd(x.val,
1091  _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
1092 }
1093 
1094 // TODO: exp, log, sin, cos
1095 
1096 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
1097 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1098 { \
1099  return _Tpvec(intrin(a.val, b.val)); \
1100 }
1101 
1102 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
1103 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
1104 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
1105 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
1106 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
1107 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
1108 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
1109 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
1110 
1111 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
1112 {
1113 #if CV_SSE4_1
1114  return v_int8x16(_mm_min_epi8(a.val, b.val));
1115 #else
1116  __m128i delta = _mm_set1_epi8((char)-128);
1117  return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
1118  _mm_xor_si128(b.val, delta))));
1119 #endif
1120 }
1121 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
1122 {
1123 #if CV_SSE4_1
1124  return v_int8x16(_mm_max_epi8(a.val, b.val));
1125 #else
1126  __m128i delta = _mm_set1_epi8((char)-128);
1127  return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
1128  _mm_xor_si128(b.val, delta))));
1129 #endif
1130 }
1131 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
1132 {
1133 #if CV_SSE4_1
1134  return v_uint16x8(_mm_min_epu16(a.val, b.val));
1135 #else
1136  return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
1137 #endif
1138 }
1139 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
1140 {
1141 #if CV_SSE4_1
1142  return v_uint16x8(_mm_max_epu16(a.val, b.val));
1143 #else
1144  return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
1145 #endif
1146 }
1147 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
1148 {
1149 #if CV_SSE4_1
1150  return v_uint32x4(_mm_min_epu32(a.val, b.val));
1151 #else
1152  __m128i delta = _mm_set1_epi32((int)0x80000000);
1153  __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
1154  return v_uint32x4(v_select_si128(mask, b.val, a.val));
1155 #endif
1156 }
1157 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
1158 {
1159 #if CV_SSE4_1
1160  return v_uint32x4(_mm_max_epu32(a.val, b.val));
1161 #else
1162  __m128i delta = _mm_set1_epi32((int)0x80000000);
1163  __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
1164  return v_uint32x4(v_select_si128(mask, a.val, b.val));
1165 #endif
1166 }
1167 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
1168 {
1169 #if CV_SSE4_1
1170  return v_int32x4(_mm_min_epi32(a.val, b.val));
1171 #else
1172  return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
1173 #endif
1174 }
1175 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
1176 {
1177 #if CV_SSE4_1
1178  return v_int32x4(_mm_max_epi32(a.val, b.val));
1179 #else
1180  return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
1181 #endif
1182 }
1183 
1184 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
1185 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
1186 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1187 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
1188 { \
1189  __m128i not_mask = _mm_set1_epi32(-1); \
1190  return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
1191 } \
1192 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
1193 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1194 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
1195 { \
1196  __m128i not_mask = _mm_set1_epi32(-1); \
1197  return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
1198 } \
1199 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
1200 { \
1201  __m128i smask = _mm_set1_##suffix(sbit); \
1202  return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
1203 } \
1204 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
1205 { \
1206  __m128i smask = _mm_set1_##suffix(sbit); \
1207  return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
1208 } \
1209 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
1210 { \
1211  __m128i smask = _mm_set1_##suffix(sbit); \
1212  __m128i not_mask = _mm_set1_epi32(-1); \
1213  __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
1214  return _Tpuvec(_mm_xor_si128(res, not_mask)); \
1215 } \
1216 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
1217 { \
1218  __m128i smask = _mm_set1_##suffix(sbit); \
1219  __m128i not_mask = _mm_set1_epi32(-1); \
1220  __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
1221  return _Tpuvec(_mm_xor_si128(res, not_mask)); \
1222 } \
1223 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
1224 { \
1225  return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
1226 } \
1227 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
1228 { \
1229  return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
1230 } \
1231 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
1232 { \
1233  __m128i not_mask = _mm_set1_epi32(-1); \
1234  return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
1235 } \
1236 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
1237 { \
1238  __m128i not_mask = _mm_set1_epi32(-1); \
1239  return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
1240 }
1241 
1242 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
1243 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
1244 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
1245 
1246 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
1247 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1248 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1249 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1250 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
1251 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1252 { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
1253 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1254 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
1255 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1256 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
1257 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1258 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
1259 
1260 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
1261 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
1262 
1263 #if CV_SSE4_1
1264 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
1265 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1266 { return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
1267 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1268 { return ~(a == b); }
1269 #else
1270 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
1271 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1272 { __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
1273  return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
1274 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1275 { return ~(a == b); }
1276 #endif
1277 
1278 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
1279 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
1280 
1281 inline v_float32x4 v_not_nan(const v_float32x4& a)
1282 { return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
1283 inline v_float64x2 v_not_nan(const v_float64x2& a)
1284 { return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
1285 
1286 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
1287 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
1288 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
1289 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
1290 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
1291 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
1292 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
1293 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
1294 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)
1295 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
1296 
1297 inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
1298 {
1299  __m128i ad = _mm_srai_epi16(a.val, 8);
1300  __m128i bd = _mm_srai_epi16(b.val, 8);
1301  __m128i p0 = _mm_mullo_epi16(a.val, b.val); // even
1302  __m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8); // odd
1303  const __m128i b01 = _mm_set1_epi32(0xFF00FF00);
1304  return v_uint8x16(_v128_blendv_epi8(p0, p1, b01));
1305 }
1306 inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
1307 {
1308  return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
1309 }
1310 
1313 inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
1314 { return v_add_wrap(a - b, b - a); }
1315 inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
1316 { return v_add_wrap(a - b, b - a); }
1317 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
1318 { return v_max(a, b) - v_min(a, b); }
1319 
1320 inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1321 {
1322  v_int8x16 d = v_sub_wrap(a, b);
1323  v_int8x16 m = a < b;
1324  return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1325 }
1326 inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1327 {
1328  return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
1329 }
1330 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1331 {
1332  v_int32x4 d = a - b;
1333  v_int32x4 m = a < b;
1334  return v_reinterpret_as_u32((d ^ m) - m);
1335 }
1336 
1338 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1339 {
1340  v_int8x16 d = a - b;
1341  v_int8x16 m = a < b;
1342  return (d ^ m) - m;
1343  }
1344 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1345 { return v_max(a, b) - v_min(a, b); }
1346 
1347 
1348 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1349 {
1350  return a * b + c;
1351 }
1352 
1353 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1354 {
1355  return v_fma(a, b, c);
1356 }
1357 
1358 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1359 {
1360 #if CV_FMA3
1361  return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
1362 #else
1363  return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
1364 #endif
1365 }
1366 
1367 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1368 {
1369 #if CV_FMA3
1370  return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
1371 #else
1372  return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
1373 #endif
1374 }
1375 
1376 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
1377 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
1378 { \
1379  _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
1380  return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
1381 } \
1382 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1383 { \
1384  _Tpvec res = v_fma(a, a, b*b); \
1385  return _Tpvec(_mm_sqrt_##suffix(res.val)); \
1386 } \
1387 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1388 { \
1389  return v_fma(a, a, b*b); \
1390 } \
1391 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1392 { \
1393  return v_fma(a, b, c); \
1394 }
1395 
1396 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
1397 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
1398 
1399 #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
1400 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
1401 { \
1402  return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1403 } \
1404 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
1405 { \
1406  return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1407 } \
1408 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
1409 { \
1410  return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1411 } \
1412 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
1413 { \
1414  return _Tpsvec(srai(a.val, imm)); \
1415 } \
1416 template<int imm> \
1417 inline _Tpuvec v_shl(const _Tpuvec& a) \
1418 { \
1419  return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1420 } \
1421 template<int imm> \
1422 inline _Tpsvec v_shl(const _Tpsvec& a) \
1423 { \
1424  return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1425 } \
1426 template<int imm> \
1427 inline _Tpuvec v_shr(const _Tpuvec& a) \
1428 { \
1429  return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1430 } \
1431 template<int imm> \
1432 inline _Tpsvec v_shr(const _Tpsvec& a) \
1433 { \
1434  return _Tpsvec(srai(a.val, imm)); \
1435 }
1436 
1437 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
1438 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
1439 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
1440 
1441 namespace hal_sse_internal
1442 {
1443  template <int imm,
1444  bool is_invalid = ((imm < 0) || (imm > 16)),
1445  bool is_first = (imm == 0),
1446  bool is_half = (imm == 8),
1447  bool is_second = (imm == 16),
1448  bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
1449  class v_sse_palignr_u8_class;
1450 
1451  template <int imm>
1452  class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
1453 
1454  template <int imm>
1455  class v_sse_palignr_u8_class<imm, false, true, false, false, false>
1456  {
1457  public:
1458  inline __m128i operator()(const __m128i& a, const __m128i&) const
1459  {
1460  return a;
1461  }
1462  };
1463 
1464  template <int imm>
1465  class v_sse_palignr_u8_class<imm, false, false, true, false, false>
1466  {
1467  public:
1468  inline __m128i operator()(const __m128i& a, const __m128i& b) const
1469  {
1470  return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
1471  }
1472  };
1473 
1474  template <int imm>
1475  class v_sse_palignr_u8_class<imm, false, false, false, true, false>
1476  {
1477  public:
1478  inline __m128i operator()(const __m128i&, const __m128i& b) const
1479  {
1480  return b;
1481  }
1482  };
1483 
1484  template <int imm>
1485  class v_sse_palignr_u8_class<imm, false, false, false, false, true>
1486  {
1487 #if CV_SSSE3
1488  public:
1489  inline __m128i operator()(const __m128i& a, const __m128i& b) const
1490  {
1491  return _mm_alignr_epi8(b, a, imm);
1492  }
1493 #else
1494  public:
1495  inline __m128i operator()(const __m128i& a, const __m128i& b) const
1496  {
1497  enum { imm2 = (sizeof(__m128i) - imm) };
1498  return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
1499  }
1500 #endif
1501  };
1502 
1503  template <int imm>
1504  inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
1505  {
1506  CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
1507  return v_sse_palignr_u8_class<imm>()(a, b);
1508  }
1509 }
1510 
1511 template<int imm, typename _Tpvec>
1512 inline _Tpvec v_rotate_right(const _Tpvec &a)
1513 {
1514  using namespace hal_sse_internal;
1515  enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1516  return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1517  _mm_srli_si128(
1518  v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1519 }
1520 
1521 template<int imm, typename _Tpvec>
1522 inline _Tpvec v_rotate_left(const _Tpvec &a)
1523 {
1524  using namespace hal_sse_internal;
1525  enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1526  return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1527  _mm_slli_si128(
1528  v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1529 }
1530 
1531 template<int imm, typename _Tpvec>
1532 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1533 {
1534  using namespace hal_sse_internal;
1535  enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1536  return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1537  v_sse_palignr_u8<imm2>(
1538  v_sse_reinterpret_as<__m128i>(a.val),
1539  v_sse_reinterpret_as<__m128i>(b.val))));
1540 }
1541 
1542 template<int imm, typename _Tpvec>
1543 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1544 {
1545  using namespace hal_sse_internal;
1546  enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1547  return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1548  v_sse_palignr_u8<imm2>(
1549  v_sse_reinterpret_as<__m128i>(b.val),
1550  v_sse_reinterpret_as<__m128i>(a.val))));
1551 }
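
// Illustrative usage sketch (editor's addition; example_shift_window_by_one is a hypothetical
// helper): the two-argument rotate pulls replacement lanes in from the second register,
// which is the usual way to slide a window across two adjacent loads.
inline v_uint8x16 example_shift_window_by_one(const v_uint8x16& a, const v_uint8x16& b)
{
    return v_rotate_right<1>(a, b);  // lanes 1..15 of a followed by lane 0 of b
}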
1552 
1553 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1554 inline _Tpvec v_load(const _Tp* ptr) \
1555 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
1556 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1557 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
1558 inline _Tpvec v_load_low(const _Tp* ptr) \
1559 { return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
1560 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1561 { \
1562  return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1563  _mm_loadl_epi64((const __m128i*)ptr1))); \
1564 } \
1565 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1566 { _mm_storeu_si128((__m128i*)ptr, a.val); } \
1567 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1568 { _mm_store_si128((__m128i*)ptr, a.val); } \
1569 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1570 { _mm_stream_si128((__m128i*)ptr, a.val); } \
1571 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1572 { \
1573  if( mode == hal::STORE_UNALIGNED ) \
1574  _mm_storeu_si128((__m128i*)ptr, a.val); \
1575  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
1576  _mm_stream_si128((__m128i*)ptr, a.val); \
1577  else \
1578  _mm_store_si128((__m128i*)ptr, a.val); \
1579 } \
1580 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1581 { _mm_storel_epi64((__m128i*)ptr, a.val); } \
1582 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1583 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
1584 
1585 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
1586 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
1587 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
1588 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
1589 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1590 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
1591 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
1592 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
1593 
1594 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
1595 inline _Tpvec v_load(const _Tp* ptr) \
1596 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \
1597 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1598 { return _Tpvec(_mm_load_##suffix(ptr)); } \
1599 inline _Tpvec v_load_low(const _Tp* ptr) \
1600 { return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
1601 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1602 { \
1603  return _Tpvec(_mm_castsi128_##suffix( \
1604  _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1605  _mm_loadl_epi64((const __m128i*)ptr1)))); \
1606 } \
1607 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1608 { _mm_storeu_##suffix(ptr, a.val); } \
1609 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1610 { _mm_store_##suffix(ptr, a.val); } \
1611 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1612 { _mm_stream_##suffix(ptr, a.val); } \
1613 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1614 { \
1615  if( mode == hal::STORE_UNALIGNED ) \
1616  _mm_storeu_##suffix(ptr, a.val); \
1617  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
1618  _mm_stream_##suffix(ptr, a.val); \
1619  else \
1620  _mm_store_##suffix(ptr, a.val); \
1621 } \
1622 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1623 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
1624 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1625 { \
1626  __m128i a1 = _mm_cast##suffix##_si128(a.val); \
1627  _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
1628 }
1629 
1630 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
1631 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
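
// Illustrative usage sketch (editor's addition; example_scale_floats is a hypothetical
// helper): the canonical load/process/store loop the wrappers above are designed for.
// v_load / v_store accept unaligned pointers; the *_aligned variants require 16-byte
// alignment and v_store_aligned_nocache uses a non-temporal store that bypasses the cache.
inline void example_scale_floats(const float* src, float* dst, int n, float k)
{
    v_float32x4 vk = v_setall_f32(k);
    int i = 0;
    for (; i <= n - 4; i += 4)
        v_store(dst + i, v_load(src + i) * vk);  // unaligned-safe load and store
    for (; i < n; ++i)                           // scalar tail
        dst[i] = src[i] * k;
}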
1632 
1633 inline unsigned v_reduce_sum(const v_uint8x16& a)
1634 {
1635  __m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
1636  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1637 }
1638 inline int v_reduce_sum(const v_int8x16& a)
1639 {
1640  __m128i half = _mm_set1_epi8((schar)-128);
1641  half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
1642  return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
1643 }
1644 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
1645 inline schar v_reduce_##func(const v_int8x16& a) \
1646 { \
1647  __m128i val = a.val; \
1648  __m128i smask = _mm_set1_epi8((schar)-128); \
1649  val = _mm_xor_si128(val, smask); \
1650  val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
1651  val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
1652  val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
1653  val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
1654  return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
1655 } \
1656 inline uchar v_reduce_##func(const v_uint8x16& a) \
1657 { \
1658  __m128i val = a.val; \
1659  val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
1660  val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
1661  val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
1662  val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
1663  return (uchar)_mm_cvtsi128_si32(val); \
1664 }
1665 OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
1666 OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
1667 
1668 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
1669 inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
1670 { \
1671  __m128i val = a.val; \
1672  val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1673  val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1674  val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1675  return (scalartype)_mm_cvtsi128_si32(val); \
1676 } \
1677 inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
1678 { \
1679  __m128i val = a.val; \
1680  __m128i smask = _mm_set1_epi16(sbit); \
1681  val = _mm_xor_si128(val, smask); \
1682  val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1683  val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1684  val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1685  return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
1686 }
1687 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
1688 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
1689 
1690 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
1691 inline scalartype v_reduce_sum(const _Tpvec& a) \
1692 { \
1693  regtype val = a.val; \
1694  val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
1695  val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
1696  return (scalartype)_mm_cvt##extract(val); \
1697 }
1698 
1699 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
1700 inline scalartype v_reduce_##func(const _Tpvec& a) \
1701 { \
1702  scalartype CV_DECL_ALIGNED(16) buf[4]; \
1703  v_store_aligned(buf, a); \
1704  scalartype s0 = scalar_func(buf[0], buf[1]); \
1705  scalartype s1 = scalar_func(buf[2], buf[3]); \
1706  return scalar_func(s0, s1); \
1707 }
1708 
1709 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1710 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1711 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
1712 
1713 inline int v_reduce_sum(const v_int16x8& a)
1714 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1715 inline unsigned v_reduce_sum(const v_uint16x8& a)
1716 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1717 
1718 inline uint64 v_reduce_sum(const v_uint64x2& a)
1719 {
1720  uint64 CV_DECL_ALIGNED(32) idx[2];
1721  v_store_aligned(idx, a);
1722  return idx[0] + idx[1];
1723 }
1724 inline int64 v_reduce_sum(const v_int64x2& a)
1725 {
1726  int64 CV_DECL_ALIGNED(32) idx[2];
1727  v_store_aligned(idx, a);
1728  return idx[0] + idx[1];
1729 }
1730 inline double v_reduce_sum(const v_float64x2& a)
1731 {
1732  double CV_DECL_ALIGNED(32) idx[2];
1733  v_store_aligned(idx, a);
1734  return idx[0] + idx[1];
1735 }
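
// Illustrative usage sketch (editor's addition; example_sum_floats is a hypothetical
// helper): keep the accumulator vectorized inside the loop and reduce it to a scalar
// only once, with v_reduce_sum, after the loop.
inline float example_sum_floats(const float* src, int n)
{
    v_float32x4 acc = v_setzero_f32();
    int i = 0;
    for (; i <= n - 4; i += 4)
        acc = acc + v_load(src + i);
    float s = v_reduce_sum(acc);
    for (; i < n; ++i)  // scalar tail
        s += src[i];
    return s;
}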
1736 
1737 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1738  const v_float32x4& c, const v_float32x4& d)
1739 {
1740 #if CV_SSE3
1741  __m128 ab = _mm_hadd_ps(a.val, b.val);
1742  __m128 cd = _mm_hadd_ps(c.val, d.val);
1743  return v_float32x4(_mm_hadd_ps(ab, cd));
1744 #else
1745  __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
1746  __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
1747  return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
1748 #endif
1749 }
1750 
1751 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
1752 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
1753 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
1754 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
1755 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
1756 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
1757 
1758 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1759 {
1760  __m128i half = _mm_sad_epu8(a.val, b.val);
1761  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1762 }
1763 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1764 {
1765  __m128i half = _mm_set1_epi8(0x7f);
1766  half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half));
1767  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1768 }
1769 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1770 {
1771  v_uint32x4 l, h;
1772  v_expand(v_absdiff(a, b), l, h);
1773  return v_reduce_sum(l + h);
1774 }
1775 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1776 {
1777  v_uint32x4 l, h;
1778  v_expand(v_absdiff(a, b), l, h);
1779  return v_reduce_sum(l + h);
1780 }
1781 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1782 {
1783  return v_reduce_sum(v_absdiff(a, b));
1784 }
1785 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1786 {
1787  return v_reduce_sum(v_absdiff(a, b));
1788 }
1789 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1790 {
1791  return v_reduce_sum(v_absdiff(a, b));
1792 }
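
// Illustrative usage sketch (editor's addition; example_block_sad16 is a hypothetical
// helper): v_reduce_sad gives the sum of absolute differences of 16 bytes in one call,
// the basic building block of block matching.
inline unsigned example_block_sad16(const uchar* p, const uchar* q)
{
    return v_reduce_sad(v_load(p), v_load(q));
}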
1793 
1794 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1795 {
1796  __m128i m1 = _mm_set1_epi32(0x55555555);
1797  __m128i m2 = _mm_set1_epi32(0x33333333);
1798  __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
1799  __m128i p = a.val;
1800  p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
1801  p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
1802  p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
1803  return v_uint8x16(p);
1804 }
1805 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1806 {
1807  v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1808  p += v_rotate_right<1>(p);
1809  return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
1810 }
1811 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1812 {
1813  v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1814  p += v_rotate_right<1>(p);
1815  p += v_rotate_right<2>(p);
1816  return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
1817 }
1818 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1819 {
1820  return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
1821 }
1822 inline v_uint8x16 v_popcount(const v_int8x16& a)
1823 { return v_popcount(v_reinterpret_as_u8(a)); }
1824 inline v_uint16x8 v_popcount(const v_int16x8& a)
1825 { return v_popcount(v_reinterpret_as_u16(a)); }
1826 inline v_uint32x4 v_popcount(const v_int32x4& a)
1827 { return v_popcount(v_reinterpret_as_u32(a)); }
1828 inline v_uint64x2 v_popcount(const v_int64x2& a)
1829 { return v_popcount(v_reinterpret_as_u64(a)); }
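
// Illustrative usage sketch (editor's addition; example_hamming_weight16 is a hypothetical
// helper): per-lane popcounts folded into a single bit count, e.g. for comparing binary
// descriptors.
inline unsigned example_hamming_weight16(const v_uint8x16& bits)
{
    return v_reduce_sum(v_popcount(bits));
}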
1830 
1831 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \
1832 inline int v_signmask(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)); } \
1833 inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \
1834 inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }
1835 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535)
1836 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
1837 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15)
1838 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
1839 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3)
1840 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
1841 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
1842 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)
1843 
1844 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \
1845 inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \
1846 inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \
1847 inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }
1848 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
1849 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
1850 
1851 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1852 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1853 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1854 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1855 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1856 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1857 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1858 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1859 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1860 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
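
// Illustrative usage sketch (editor's addition; example_blocks_differ is a hypothetical
// helper): mask vectors produced by the comparison operators are normally consumed with
// v_check_any / v_check_all, or turned into a per-lane bit mask with v_signmask.
inline bool example_blocks_differ(const v_uint8x16& a, const v_uint8x16& b)
{
    return v_check_any(a != b);
}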
1861 
1862 #if CV_SSE4_1
1863 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
1864 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1865 { \
1866  return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
1867 }
1868 
1869 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1870 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1871 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1872 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1873 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1874 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1875 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
1876 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
1877 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
1878 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
1879 
1880 #else // CV_SSE4_1
1881 
1882 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
1883 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1884 { \
1885  return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
1886 }
1887 
1888 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
1889 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
1890 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
1891 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
1892 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
1893 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
1894 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
1895 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
1896 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
1897 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
1898 #endif
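
// Illustrative usage sketch (editor's addition; example_clamp_below is a hypothetical
// helper): branch-free selection with v_select. The mask lanes must be all-ones or
// all-zeros, which is exactly what the comparison operators produce.
inline v_float32x4 example_clamp_below(const v_float32x4& x, const v_float32x4& lo)
{
    return v_select(x < lo, lo, x);
}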
1899 
1900 /* Expand */
1901 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1902  inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1903  { \
1904  b0.val = intrin(a.val); \
1905  b1.val = __CV_CAT(intrin, _high)(a.val); \
1906  } \
1907  inline _Tpwvec v_expand_low(const _Tpvec& a) \
1908  { return _Tpwvec(intrin(a.val)); } \
1909  inline _Tpwvec v_expand_high(const _Tpvec& a) \
1910  { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
1911  inline _Tpwvec v_load_expand(const _Tp* ptr) \
1912  { \
1913  __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
1914  return _Tpwvec(intrin(a)); \
1915  }
1916 
1917 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, _v128_cvtepu8_epi16)
1918 OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16, v_int16x8, schar, _v128_cvtepi8_epi16)
1919 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, _v128_cvtepu16_epi32)
1920 OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8, v_int32x4, short, _v128_cvtepi16_epi32)
1921 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64)
1922 OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4, v_int64x2, int, _v128_cvtepi32_epi64)
1923 
1924 #define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \
1925  inline _Tpvec v_load_expand_q(const _Tp* ptr) \
1926  { \
1927  typedef int CV_DECL_ALIGNED(1) unaligned_int; \
1928  __m128i a = _mm_cvtsi32_si128(*(const unaligned_int*)ptr); \
1929  return _Tpvec(intrin(a)); \
1930  }
1931 
1932 OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
1933 OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32)
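
// Illustrative usage sketch (editor's addition; example_widen_and_double is a hypothetical
// helper): widen 8-bit data to 16-bit lanes before arithmetic so intermediate results
// cannot overflow; v_load_expand reads 8 bytes and returns them as v_uint16x8.
inline v_uint16x8 example_widen_and_double(const uchar* src)
{
    v_uint16x8 w = v_load_expand(src);
    return w + w;
}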
1934 
1935 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
1936 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1937 { \
1938  b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
1939  b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
1940 } \
1941 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1942 { \
1943  __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1944  return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
1945 } \
1946 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1947 { \
1948  __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1949  return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
1950 } \
1951 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1952 { \
1953  __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1954  c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
1955  d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
1956 }
1957 
1958 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1959 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1960 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1961 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1962 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1963 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1964 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
1965 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
1966 
1967 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1968 {
1969 #if CV_SSSE3
1970  static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1971  return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
1972 #else
1973  uchar CV_DECL_ALIGNED(32) d[16];
1974  v_store_aligned(d, a);
1975  return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
1976 #endif
1977 }
1978 
1979 inline v_int8x16 v_reverse(const v_int8x16 &a)
1980 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1981 
1982 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1983 {
1984 #if CV_SSSE3
1985  static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1986  return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
1987 #else
1988  __m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
1989  r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
1990  r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
1991  return v_uint16x8(r);
1992 #endif
1993 }
1994 
1995 inline v_int16x8 v_reverse(const v_int16x8 &a)
1996 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1997 
1998 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1999 {
2000  return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
2001 }
2002 
2003 inline v_int32x4 v_reverse(const v_int32x4 &a)
2004 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
2005 
2006 inline v_float32x4 v_reverse(const v_float32x4 &a)
2007 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
2008 
2009 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
2010 {
2011  return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
2012 }
2013 
2014 inline v_int64x2 v_reverse(const v_int64x2 &a)
2015 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
2016 
2017 inline v_float64x2 v_reverse(const v_float64x2 &a)
2018 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
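
// Illustrative usage sketch (editor's addition; example_mirror_row16 is a hypothetical
// helper): v_reverse flips the lane order, e.g. to mirror a 16-pixel block of a row.
inline v_uint8x16 example_mirror_row16(const uchar* row)
{
    return v_reverse(v_load(row));
}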
2019 
2020 template<int s, typename _Tpvec>
2021 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
2022 {
2023  return v_rotate_right<s>(a, b);
2024 }
2025 
2026 inline v_int32x4 v_round(const v_float32x4& a)
2027 { return v_int32x4(_mm_cvtps_epi32(a.val)); }
2028 
2029 inline v_int32x4 v_floor(const v_float32x4& a)
2030 {
2031  __m128i a1 = _mm_cvtps_epi32(a.val);
2032  __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
2033  return v_int32x4(_mm_add_epi32(a1, mask));
2034 }
2035 
2036 inline v_int32x4 v_ceil(const v_float32x4& a)
2037 {
2038  __m128i a1 = _mm_cvtps_epi32(a.val);
2039  __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
2040  return v_int32x4(_mm_sub_epi32(a1, mask));
2041 }
2042 
2043 inline v_int32x4 v_trunc(const v_float32x4& a)
2044 { return v_int32x4(_mm_cvttps_epi32(a.val)); }
2045 
2046 inline v_int32x4 v_round(const v_float64x2& a)
2047 { return v_int32x4(_mm_cvtpd_epi32(a.val)); }
2048 
2049 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2050 {
2051  __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
2052  return v_int32x4(_mm_unpacklo_epi64(ai, bi));
2053 }
2054 
2055 inline v_int32x4 v_floor(const v_float64x2& a)
2056 {
2057  __m128i a1 = _mm_cvtpd_epi32(a.val);
2058  __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
2059  mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
2060  return v_int32x4(_mm_add_epi32(a1, mask));
2061 }
2062 
2063 inline v_int32x4 v_ceil(const v_float64x2& a)
2064 {
2065  __m128i a1 = _mm_cvtpd_epi32(a.val);
2066  __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
2067  mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
2068  return v_int32x4(_mm_sub_epi32(a1, mask));
2069 }
2070 
2071 inline v_int32x4 v_trunc(const v_float64x2& a)
2072 { return v_int32x4(_mm_cvttpd_epi32(a.val)); }
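
// Illustrative usage sketch (editor's addition; example_round_half_up is a hypothetical
// helper): v_round follows the current rounding mode (round-to-nearest-even by default),
// while v_floor / v_ceil emulate directed rounding by correcting the converted value with
// a comparison mask, as implemented above.
inline v_int32x4 example_round_half_up(const v_float32x4& x)
{
    return v_floor(x + v_setall_f32(0.5f));  // floor(x + 0.5), a common "round half up" recipe
}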
2073 
2074 #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
2075 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
2076  const _Tpvec& a2, const _Tpvec& a3, \
2077  _Tpvec& b0, _Tpvec& b1, \
2078  _Tpvec& b2, _Tpvec& b3) \
2079 { \
2080  __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
2081  __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
2082  __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
2083  __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
2084 \
2085  b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
2086  b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
2087  b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
2088  b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
2089 }
2090 
2091 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2092 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2093 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
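
// Illustrative usage sketch (editor's addition; example_load_transposed4x4 is a
// hypothetical helper): in-register 4x4 transpose, turning four row vectors of a small
// row-major matrix into column vectors without extra memory traffic.
inline void example_load_transposed4x4(const float* m, v_float32x4 cols[4])
{
    v_float32x4 r0 = v_load(m), r1 = v_load(m + 4), r2 = v_load(m + 8), r3 = v_load(m + 12);
    v_transpose4x4(r0, r1, r2, r3, cols[0], cols[1], cols[2], cols[3]);
}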
2094 
2095 // load deinterleave
2096 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
2097 {
2098  __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2099  __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2100 
2101  __m128i t10 = _mm_unpacklo_epi8(t00, t01);
2102  __m128i t11 = _mm_unpackhi_epi8(t00, t01);
2103 
2104  __m128i t20 = _mm_unpacklo_epi8(t10, t11);
2105  __m128i t21 = _mm_unpackhi_epi8(t10, t11);
2106 
2107  __m128i t30 = _mm_unpacklo_epi8(t20, t21);
2108  __m128i t31 = _mm_unpackhi_epi8(t20, t21);
2109 
2110  a.val = _mm_unpacklo_epi8(t30, t31);
2111  b.val = _mm_unpackhi_epi8(t30, t31);
2112 }
2113 
2114 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
2115 {
2116 #if CV_SSE4_1
2117  const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2118  const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2119  __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
2120  __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2121  __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2122  __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
2123  __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
2124  __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
2125  const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
2126  const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
2127  const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2128  a0 = _mm_shuffle_epi8(a0, sh_b);
2129  b0 = _mm_shuffle_epi8(b0, sh_g);
2130  c0 = _mm_shuffle_epi8(c0, sh_r);
2131  a.val = a0;
2132  b.val = b0;
2133  c.val = c0;
2134 #elif CV_SSSE3
2135  const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
2136  const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
2137  const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
2138 
2139  __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2140  __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2141  __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2142 
2143  __m128i s0 = _mm_shuffle_epi8(t0, m0);
2144  __m128i s1 = _mm_shuffle_epi8(t1, m1);
2145  __m128i s2 = _mm_shuffle_epi8(t2, m2);
2146 
2147  t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
2148  a.val = _mm_alignr_epi8(s2, t0, 5);
2149 
2150  t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
2151  b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
2152 
2153  t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
2154  c.val = _mm_alignr_epi8(t2, s0, 11);
2155 #else
2156  __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2157  __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2158  __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2159 
2160  __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
2161  __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
2162  __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
2163 
2164  __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
2165  __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
2166  __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
2167 
2168  __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
2169  __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
2170  __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
2171 
2172  a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
2173  b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
2174  c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
2175 #endif
2176 }
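
// Illustrative usage sketch (editor's addition; example_green_channel is a hypothetical
// helper): de-interleaving 48 packed RGB bytes into three planar registers, the typical
// use of the 3-channel overload above.
inline v_uint8x16 example_green_channel(const uchar* rgb)
{
    v_uint8x16 r, g, b;
    v_load_deinterleave(rgb, r, g, b);
    return g;
}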
2177 
2178 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2179 {
2180  __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2181  __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2182  __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
2183  __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
2184 
2185  __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
2186  __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
2187  __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
2188  __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
2189 
2190  u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
2191  u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
2192  u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
2193  u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
2194 
2195  v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
2196  v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
2197  v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
2198  v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
2199 
2200  a.val = _mm_unpacklo_epi8(v0, v1);
2201  b.val = _mm_unpackhi_epi8(v0, v1);
2202  c.val = _mm_unpacklo_epi8(v2, v3);
2203  d.val = _mm_unpackhi_epi8(v2, v3);
2204 }
2205 
2206 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2207 {
2208  __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
2209  __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
2210 
2211  __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
2212  __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
2213  __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
2214  __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
2215 
2216  a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
2217  b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
2218 }
2219 
2220 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2221 {
2222 #if CV_SSE4_1
2223  __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
2224  __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
2225  __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
2226  __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
2227  __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
2228  __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
2229 
2230  const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2231  const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2232  const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2233  a0 = _mm_shuffle_epi8(a0, sh_a);
2234  b0 = _mm_shuffle_epi8(b0, sh_b);
2235  c0 = _mm_shuffle_epi8(c0, sh_c);
2236 
2237  a.val = a0;
2238  b.val = b0;
2239  c.val = c0;
2240 #else
2241  __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2242  __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2243  __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2244 
2245  __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
2246  __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
2247  __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
2248 
2249  __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
2250  __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
2251  __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
2252 
2253  a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
2254  b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
2255  c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
2256 #endif
2257 }
2258 
2259 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2260 {
2261  __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2262  __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
2263  __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2264  __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
2265 
2266  __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
2267  __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
2268  __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
2269  __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
2270 
2271  u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
2272  u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
2273  u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
2274  u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
2275 
2276  a.val = _mm_unpacklo_epi16(u0, u1);
2277  b.val = _mm_unpackhi_epi16(u0, u1);
2278  c.val = _mm_unpacklo_epi16(u2, u3);
2279  d.val = _mm_unpackhi_epi16(u2, u3);
2280 }
2281 
2282 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2283 {
2284  __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1
2285  __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
2286 
2287  __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
2288  __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
2289 
2290  a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
2291  b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
2292 }
2293 
2294 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2295 {
2296  __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2297  __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
2298  __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2299 
2300  __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
2301  __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
2302  __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
2303 
2304  a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
2305  b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
2306  c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
2307 }
2308 
2309 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2310 {
2311  v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
2312  v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
2313  v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
2314  v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
2315 
2316  v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2317 }
2318 
2319 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2320 {
2321  __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
2322  __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
2323 
2324  a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
2325  b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 b2 b3
2326 }
2327 
2328 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2329 {
2330  __m128 t0 = _mm_loadu_ps(ptr + 0);
2331  __m128 t1 = _mm_loadu_ps(ptr + 4);
2332  __m128 t2 = _mm_loadu_ps(ptr + 8);
2333 
2334  __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
2335  a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));
2336 
2337  __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
2338  __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
2339  b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));
2340 
2341  __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
2342  c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
2343 }
2344 
2345 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2346 {
2347  __m128 t0 = _mm_loadu_ps(ptr + 0);
2348  __m128 t1 = _mm_loadu_ps(ptr + 4);
2349  __m128 t2 = _mm_loadu_ps(ptr + 8);
2350  __m128 t3 = _mm_loadu_ps(ptr + 12);
2351  __m128 t02lo = _mm_unpacklo_ps(t0, t2);
2352  __m128 t13lo = _mm_unpacklo_ps(t1, t3);
2353  __m128 t02hi = _mm_unpackhi_ps(t0, t2);
2354  __m128 t13hi = _mm_unpackhi_ps(t1, t3);
2355  a.val = _mm_unpacklo_ps(t02lo, t13lo);
2356  b.val = _mm_unpackhi_ps(t02lo, t13lo);
2357  c.val = _mm_unpacklo_ps(t02hi, t13hi);
2358  d.val = _mm_unpackhi_ps(t02hi, t13hi);
2359 }
2360 
2361 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2362 {
2363  __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2364  __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
2365 
2366  a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2367  b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
2368 }
2369 
2370 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2371 {
2372  __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
2373  __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
2374  __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
2375 
2376  t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
2377 
2378  a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2379  b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
2380  c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
2381 }
2382 
2383 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2384  v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2385 {
2386  __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
2387  __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
2388  __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
2389  __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
2390 
2391  a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
2392  b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
2393  c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
2394  d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
2395 }
2396 
2397 // store interleave
2398 
2399 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2400  hal::StoreMode mode=hal::STORE_UNALIGNED)
2401 {
2402  __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
2403  __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
2404 
2405  if( mode == hal::STORE_ALIGNED_NOCACHE )
2406  {
2407  _mm_stream_si128((__m128i*)(ptr), v0);
2408  _mm_stream_si128((__m128i*)(ptr + 16), v1);
2409  }
2410  else if( mode == hal::STORE_ALIGNED )
2411  {
2412  _mm_store_si128((__m128i*)(ptr), v0);
2413  _mm_store_si128((__m128i*)(ptr + 16), v1);
2414  }
2415  else
2416  {
2417  _mm_storeu_si128((__m128i*)(ptr), v0);
2418  _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2419  }
2420 }
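
// Illustrative usage sketch (editor's addition; example_store_pairs is a hypothetical
// helper): writing two planar registers back as interleaved pairs; the optional
// hal::StoreMode argument selects unaligned, aligned, or non-temporal (streaming) stores.
inline void example_store_pairs(uchar* dst, const v_uint8x16& x, const v_uint8x16& y)
{
    v_store_interleave(dst, x, y, hal::STORE_ALIGNED_NOCACHE);  // dst must be 16-byte aligned
}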
2421 
2422 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2423  const v_uint8x16& c, hal::StoreMode mode=hal::STORE_UNALIGNED)
2424 {
2425 #if CV_SSE4_1
2426  const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2427  const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2428  const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2429  __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2430  __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2431  __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2432 
2433  const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2434  const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2435  __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
2436  __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
2437  __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
2438 #elif CV_SSSE3
2439  const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
2440  const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
2441  const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
2442 
2443  __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
2444  t0 = _mm_alignr_epi8(c.val, t0, 5);
2445  __m128i v0 = _mm_shuffle_epi8(t0, m0);
2446 
2447  __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
2448  t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
2449  __m128i v1 = _mm_shuffle_epi8(t1, m1);
2450 
2451  __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
2452  t2 = _mm_alignr_epi8(t2, a.val, 11);
2453  __m128i v2 = _mm_shuffle_epi8(t2, m2);
2454 #else
2455  __m128i z = _mm_setzero_si128();
2456  __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
2457  __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
2458  __m128i c0 = _mm_unpacklo_epi8(c.val, z);
2459  __m128i c1 = _mm_unpackhi_epi8(c.val, z);
2460 
2461  __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
2462  __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
2463  __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
2464  __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
2465 
2466  __m128i p10 = _mm_unpacklo_epi32(p00, p01);
2467  __m128i p11 = _mm_unpackhi_epi32(p00, p01);
2468  __m128i p12 = _mm_unpacklo_epi32(p02, p03);
2469  __m128i p13 = _mm_unpackhi_epi32(p02, p03);
2470 
2471  __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2472  __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2473  __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2474  __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2475 
2476  p20 = _mm_slli_si128(p20, 1);
2477  p22 = _mm_slli_si128(p22, 1);
2478 
2479  __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
2480  __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
2481  __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
2482  __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
2483 
2484  __m128i p40 = _mm_unpacklo_epi64(p30, p31);
2485  __m128i p41 = _mm_unpackhi_epi64(p30, p31);
2486  __m128i p42 = _mm_unpacklo_epi64(p32, p33);
2487  __m128i p43 = _mm_unpackhi_epi64(p32, p33);
2488 
2489  __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
2490  __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
2491  __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
2492 #endif
2493 
2494  if( mode == hal::STORE_ALIGNED_NOCACHE )
2495  {
2496  _mm_stream_si128((__m128i*)(ptr), v0);
2497  _mm_stream_si128((__m128i*)(ptr + 16), v1);
2498  _mm_stream_si128((__m128i*)(ptr + 32), v2);
2499  }
2500  else if( mode == hal::STORE_ALIGNED )
2501  {
2502  _mm_store_si128((__m128i*)(ptr), v0);
2503  _mm_store_si128((__m128i*)(ptr + 16), v1);
2504  _mm_store_si128((__m128i*)(ptr + 32), v2);
2505  }
2506  else
2507  {
2508  _mm_storeu_si128((__m128i*)(ptr), v0);
2509  _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2510  _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2511  }
2512 }
2513 
2514 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2515  const v_uint8x16& c, const v_uint8x16& d,
2516  hal::StoreMode mode=hal::STORE_UNALIGNED)
2517 {
2518  // a0 a1 a2 a3 ....
2519  // b0 b1 b2 b3 ....
2520  // c0 c1 c2 c3 ....
2521  // d0 d1 d2 d3 ....
2522  __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
2523  __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
2524  __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
2525  __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
2526 
2527  __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
2528  __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
2529  __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
2530  __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
2531 
2532  if( mode == hal::STORE_ALIGNED_NOCACHE )
2533  {
2534  _mm_stream_si128((__m128i*)(ptr), v0);
2535  _mm_stream_si128((__m128i*)(ptr + 16), v1);
2536  _mm_stream_si128((__m128i*)(ptr + 32), v2);
2537  _mm_stream_si128((__m128i*)(ptr + 48), v3);
2538  }
2539  else if( mode == hal::STORE_ALIGNED )
2540  {
2541  _mm_store_si128((__m128i*)(ptr), v0);
2542  _mm_store_si128((__m128i*)(ptr + 16), v1);
2543  _mm_store_si128((__m128i*)(ptr + 32), v2);
2544  _mm_store_si128((__m128i*)(ptr + 48), v3);
2545  }
2546  else
2547  {
2548  _mm_storeu_si128((__m128i*)(ptr), v0);
2549  _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2550  _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2551  _mm_storeu_si128((__m128i*)(ptr + 48), v3);
2552  }
2553 }
2554 
2555 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2556  hal::StoreMode mode=hal::STORE_UNALIGNED)
2557 {
2558  __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
2559  __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
2560 
2561  if( mode == hal::STORE_ALIGNED_NOCACHE )
2562  {
2563  _mm_stream_si128((__m128i*)(ptr), v0);
2564  _mm_stream_si128((__m128i*)(ptr + 8), v1);
2565  }
2566  else if( mode == hal::STORE_ALIGNED )
2567  {
2568  _mm_store_si128((__m128i*)(ptr), v0);
2569  _mm_store_si128((__m128i*)(ptr + 8), v1);
2570  }
2571  else
2572  {
2573  _mm_storeu_si128((__m128i*)(ptr), v0);
2574  _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2575  }
2576 }
2577 
2578 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2579  const v_uint16x8& b, const v_uint16x8& c,
2580  hal::StoreMode mode=hal::STORE_UNALIGNED)
2581 {
2582 #if CV_SSE4_1
2583  const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2584  const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2585  const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2586  __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2587  __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2588  __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2589 
2590  __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
2591  __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
2592  __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
2593 #else
2594  __m128i z = _mm_setzero_si128();
2595  __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
2596  __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
2597  __m128i c0 = _mm_unpacklo_epi16(c.val, z);
2598  __m128i c1 = _mm_unpackhi_epi16(c.val, z);
2599 
2600  __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
2601  __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
2602  __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
2603  __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
2604 
2605  __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2606  __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2607  __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2608  __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2609 
2610  p20 = _mm_slli_si128(p20, 2);
2611  p22 = _mm_slli_si128(p22, 2);
2612 
2613  __m128i p30 = _mm_unpacklo_epi64(p20, p21);
2614  __m128i p31 = _mm_unpackhi_epi64(p20, p21);
2615  __m128i p32 = _mm_unpacklo_epi64(p22, p23);
2616  __m128i p33 = _mm_unpackhi_epi64(p22, p23);
2617 
2618  __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
2619  __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
2620  __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
2621 #endif
2622  if( mode == hal::STORE_ALIGNED_NOCACHE )
2623  {
2624  _mm_stream_si128((__m128i*)(ptr), v0);
2625  _mm_stream_si128((__m128i*)(ptr + 8), v1);
2626  _mm_stream_si128((__m128i*)(ptr + 16), v2);
2627  }
2628  else if( mode == hal::STORE_ALIGNED )
2629  {
2630  _mm_store_si128((__m128i*)(ptr), v0);
2631  _mm_store_si128((__m128i*)(ptr + 8), v1);
2632  _mm_store_si128((__m128i*)(ptr + 16), v2);
2633  }
2634  else
2635  {
2636  _mm_storeu_si128((__m128i*)(ptr), v0);
2637  _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2638  _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2639  }
2640 }
2641 
2642 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2643  const v_uint16x8& c, const v_uint16x8& d,
2644  hal::StoreMode mode=hal::STORE_UNALIGNED)
2645 {
2646  // a0 a1 a2 a3 ....
2647  // b0 b1 b2 b3 ....
2648  // c0 c1 c2 c3 ....
2649  // d0 d1 d2 d3 ....
2650  __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
2651  __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
2652  __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
2653  __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
2654 
2655  __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
2656  __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
2657  __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
2658  __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
2659 
2660  if( mode == hal::STORE_ALIGNED_NOCACHE )
2661  {
2662  _mm_stream_si128((__m128i*)(ptr), v0);
2663  _mm_stream_si128((__m128i*)(ptr + 8), v1);
2664  _mm_stream_si128((__m128i*)(ptr + 16), v2);
2665  _mm_stream_si128((__m128i*)(ptr + 24), v3);
2666  }
2667  else if( mode == hal::STORE_ALIGNED )
2668  {
2669  _mm_store_si128((__m128i*)(ptr), v0);
2670  _mm_store_si128((__m128i*)(ptr + 8), v1);
2671  _mm_store_si128((__m128i*)(ptr + 16), v2);
2672  _mm_store_si128((__m128i*)(ptr + 24), v3);
2673  }
2674  else
2675  {
2676  _mm_storeu_si128((__m128i*)(ptr), v0);
2677  _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2678  _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2679  _mm_storeu_si128((__m128i*)(ptr + 24), v3);
2680  }
2681 }
2682 
2683 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2684  hal::StoreMode mode=hal::STORE_UNALIGNED)
2685 {
2686  __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
2687  __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
2688 
2689  if( mode == hal::STORE_ALIGNED_NOCACHE )
2690  {
2691  _mm_stream_si128((__m128i*)(ptr), v0);
2692  _mm_stream_si128((__m128i*)(ptr + 4), v1);
2693  }
2694  else if( mode == hal::STORE_ALIGNED )
2695  {
2696  _mm_store_si128((__m128i*)(ptr), v0);
2697  _mm_store_si128((__m128i*)(ptr + 4), v1);
2698  }
2699  else
2700  {
2701  _mm_storeu_si128((__m128i*)(ptr), v0);
2702  _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2703  }
2704 }
2705 
2706 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2707  const v_uint32x4& c, hal::StoreMode mode=hal::STORE_UNALIGNED)
2708 {
2709  v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
2710  v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
2711 
2712  __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
2713  __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
2714  __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
2715 
2716  if( mode == hal::STORE_ALIGNED_NOCACHE )
2717  {
2718  _mm_stream_si128((__m128i*)(ptr), v0);
2719  _mm_stream_si128((__m128i*)(ptr + 4), v1);
2720  _mm_stream_si128((__m128i*)(ptr + 8), v2);
2721  }
2722  else if( mode == hal::STORE_ALIGNED )
2723  {
2724  _mm_store_si128((__m128i*)(ptr), v0);
2725  _mm_store_si128((__m128i*)(ptr + 4), v1);
2726  _mm_store_si128((__m128i*)(ptr + 8), v2);
2727  }
2728  else
2729  {
2730  _mm_storeu_si128((__m128i*)(ptr), v0);
2731  _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2732  _mm_storeu_si128((__m128i*)(ptr + 8), v2);
2733  }
2734 }
2735 
2736 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2737  const v_uint32x4& c, const v_uint32x4& d,
2738  hal::StoreMode mode=hal::STORE_UNALIGNED)
2739 {
2740  v_uint32x4 v0, v1, v2, v3;
2741  v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2742 
2743  if( mode == hal::STORE_ALIGNED_NOCACHE )
2744  {
2745  _mm_stream_si128((__m128i*)(ptr), v0.val);
2746  _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
2747  _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
2748  _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
2749  }
2750  else if( mode == hal::STORE_ALIGNED )
2751  {
2752  _mm_store_si128((__m128i*)(ptr), v0.val);
2753  _mm_store_si128((__m128i*)(ptr + 4), v1.val);
2754  _mm_store_si128((__m128i*)(ptr + 8), v2.val);
2755  _mm_store_si128((__m128i*)(ptr + 12), v3.val);
2756  }
2757  else
2758  {
2759  _mm_storeu_si128((__m128i*)(ptr), v0.val);
2760  _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
2761  _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
2762  _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
2763  }
2764 }
2765 
2766 // 2-channel, float only
2767 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2768  hal::StoreMode mode=hal::STORE_UNALIGNED)
2769 {
2770  __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
2771  __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
2772 
2773  if( mode == hal::STORE_ALIGNED_NOCACHE )
2774  {
2775  _mm_stream_ps(ptr, v0);
2776  _mm_stream_ps(ptr + 4, v1);
2777  }
2778  else if( mode == hal::STORE_ALIGNED )
2779  {
2780  _mm_store_ps(ptr, v0);
2781  _mm_store_ps(ptr + 4, v1);
2782  }
2783  else
2784  {
2785  _mm_storeu_ps(ptr, v0);
2786  _mm_storeu_ps(ptr + 4, v1);
2787  }
2788 }
2789 
2790 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2791  const v_float32x4& c, hal::StoreMode mode=hal::STORE_UNALIGNED)
2792 {
2793  __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
2794  __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
2795  __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
2796  __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
2797  __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
2798  __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
2799  __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
2800  __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
2801  __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
2802 
2803  if( mode == hal::STORE_ALIGNED_NOCACHE )
2804  {
2805  _mm_stream_ps(ptr, v0);
2806  _mm_stream_ps(ptr + 4, v1);
2807  _mm_stream_ps(ptr + 8, v2);
2808  }
2809  else if( mode == hal::STORE_ALIGNED )
2810  {
2811  _mm_store_ps(ptr, v0);
2812  _mm_store_ps(ptr + 4, v1);
2813  _mm_store_ps(ptr + 8, v2);
2814  }
2815  else
2816  {
2817  _mm_storeu_ps(ptr, v0);
2818  _mm_storeu_ps(ptr + 4, v1);
2819  _mm_storeu_ps(ptr + 8, v2);
2820  }
2821 }
2822 
2823 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2824  const v_float32x4& c, const v_float32x4& d,
2825  hal::StoreMode mode = hal::STORE_UNALIGNED)
2826 {
2827  __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
2828  __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
2829  __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
2830  __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
2831  __m128 v0 = _mm_unpacklo_ps(u0, u1);
2832  __m128 v2 = _mm_unpacklo_ps(u2, u3);
2833  __m128 v1 = _mm_unpackhi_ps(u0, u1);
2834  __m128 v3 = _mm_unpackhi_ps(u2, u3);
2835 
2836  if( mode == hal::STORE_ALIGNED_NOCACHE )
2837  {
2838  _mm_stream_ps(ptr, v0);
2839  _mm_stream_ps(ptr + 4, v1);
2840  _mm_stream_ps(ptr + 8, v2);
2841  _mm_stream_ps(ptr + 12, v3);
2842  }
2843  else if( mode == hal::STORE_ALIGNED )
2844  {
2845  _mm_store_ps(ptr, v0);
2846  _mm_store_ps(ptr + 4, v1);
2847  _mm_store_ps(ptr + 8, v2);
2848  _mm_store_ps(ptr + 12, v3);
2849  }
2850  else
2851  {
2852  _mm_storeu_ps(ptr, v0);
2853  _mm_storeu_ps(ptr + 4, v1);
2854  _mm_storeu_ps(ptr + 8, v2);
2855  _mm_storeu_ps(ptr + 12, v3);
2856  }
2857 }
2858 
2859 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2860  hal::StoreMode mode = hal::STORE_UNALIGNED)
2861 {
2862  __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2863  __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
2864 
2865  if( mode == hal::STORE_ALIGNED_NOCACHE )
2866  {
2867  _mm_stream_si128((__m128i*)(ptr), v0);
2868  _mm_stream_si128((__m128i*)(ptr + 2), v1);
2869  }
2870  else if( mode == hal::STORE_ALIGNED )
2871  {
2872  _mm_store_si128((__m128i*)(ptr), v0);
2873  _mm_store_si128((__m128i*)(ptr + 2), v1);
2874  }
2875  else
2876  {
2877  _mm_storeu_si128((__m128i*)(ptr), v0);
2878  _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2879  }
2880 }
2881 
2882 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2883  const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2884 {
2885  __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2886  __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
2887  __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
2888 
2889  if( mode == hal::STORE_ALIGNED_NOCACHE )
2890  {
2891  _mm_stream_si128((__m128i*)(ptr), v0);
2892  _mm_stream_si128((__m128i*)(ptr + 2), v1);
2893  _mm_stream_si128((__m128i*)(ptr + 4), v2);
2894  }
2895  else if( mode == hal::STORE_ALIGNED )
2896  {
2897  _mm_store_si128((__m128i*)(ptr), v0);
2898  _mm_store_si128((__m128i*)(ptr + 2), v1);
2899  _mm_store_si128((__m128i*)(ptr + 4), v2);
2900  }
2901  else
2902  {
2903  _mm_storeu_si128((__m128i*)(ptr), v0);
2904  _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2905  _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2906  }
2907 }
2908 
2909 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2910  const v_uint64x2& c, const v_uint64x2& d,
2911  hal::StoreMode mode = hal::STORE_UNALIGNED)
2912 {
2913  __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2914  __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
2915  __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
2916  __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
2917 
2918  if( mode == hal::STORE_ALIGNED_NOCACHE )
2919  {
2920  _mm_stream_si128((__m128i*)(ptr), v0);
2921  _mm_stream_si128((__m128i*)(ptr + 2), v1);
2922  _mm_stream_si128((__m128i*)(ptr + 4), v2);
2923  _mm_stream_si128((__m128i*)(ptr + 6), v3);
2924  }
2925  else if( mode == hal::STORE_ALIGNED )
2926  {
2927  _mm_store_si128((__m128i*)(ptr), v0);
2928  _mm_store_si128((__m128i*)(ptr + 2), v1);
2929  _mm_store_si128((__m128i*)(ptr + 4), v2);
2930  _mm_store_si128((__m128i*)(ptr + 6), v3);
2931  }
2932  else
2933  {
2934  _mm_storeu_si128((__m128i*)(ptr), v0);
2935  _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2936  _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2937  _mm_storeu_si128((__m128i*)(ptr + 6), v3);
2938  }
2939 }
2940 
2941 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2942 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2943 { \
2944  _Tpvec1 a1, b1; \
2945  v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2946  a0 = v_reinterpret_as_##suffix0(a1); \
2947  b0 = v_reinterpret_as_##suffix0(b1); \
2948 } \
2949 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2950 { \
2951  _Tpvec1 a1, b1, c1; \
2952  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2953  a0 = v_reinterpret_as_##suffix0(a1); \
2954  b0 = v_reinterpret_as_##suffix0(b1); \
2955  c0 = v_reinterpret_as_##suffix0(c1); \
2956 } \
2957 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2958 { \
2959  _Tpvec1 a1, b1, c1, d1; \
2960  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2961  a0 = v_reinterpret_as_##suffix0(a1); \
2962  b0 = v_reinterpret_as_##suffix0(b1); \
2963  c0 = v_reinterpret_as_##suffix0(c1); \
2964  d0 = v_reinterpret_as_##suffix0(d1); \
2965 } \
2966 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2967  hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2968 { \
2969  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2970  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2971  v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2972 } \
2973 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2974  const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2975 { \
2976  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2977  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2978  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2979  v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2980 } \
2981 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2982  const _Tpvec0& c0, const _Tpvec0& d0, \
2983  hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2984 { \
2985  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2986  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2987  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2988  _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2989  v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2990 }
2991 
2992 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2993 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2994 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2995 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2996 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
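
// The macro above forwards the signed and double-precision variants to the unsigned
// implementations via lane reinterpretation. Below is a brief usage sketch of the
// resulting interleave/deinterleave API. It is an illustrative addition, not part of the
// original header; 'example_swap_r_and_b', 'bgr' and 'npixels' are hypothetical names.
inline void example_swap_r_and_b(uchar* bgr, int npixels)
{
    int i = 0;
    for( ; i <= npixels - 16; i += 16 )
    {
        v_uint8x16 b, g, r;
        v_load_deinterleave(bgr + 3*i, b, g, r);  // split 16 interleaved BGR pixels into planes
        v_store_interleave(bgr + 3*i, r, g, b);   // write them back with R and B swapped
    }
    // the remaining npixels - i pixels would be handled by scalar code
}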
2997 
2998 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2999 {
3000  return v_float32x4(_mm_cvtepi32_ps(a.val));
3001 }
3002 
3003 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
3004 {
3005  return v_float32x4(_mm_cvtpd_ps(a.val));
3006 }
3007 
3008 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
3009 {
3010  return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
3011 }
3012 
3013 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
3014 {
3015  return v_float64x2(_mm_cvtepi32_pd(a.val));
3016 }
3017 
3018 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
3019 {
3020  return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
3021 }
3022 
3023 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
3024 {
3025  return v_float64x2(_mm_cvtps_pd(a.val));
3026 }
3027 
3028 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
3029 {
3030  return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
3031 }
3032 
3033 // from (Mysticial and wim) https://stackoverflow.com/q/41144668
3034 inline v_float64x2 v_cvt_f64(const v_int64x2& v)
3035 {
3036  // constants encoded as floating-point
3037  __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
3038  __m128i magic_i_all = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
3039  __m128d magic_d_all = _mm_castsi128_pd(magic_i_all);
3040  // Blend the 32 least significant bits of v with magic_i_lo
3041 #if CV_SSE4_1
3042  __m128i magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52
3043  __m128i v_lo = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
3044 #else
3045  __m128i magic_i_lo = _mm_set1_epi32(0x43300000); // 2^52
3046  __m128i v_lo = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
3047 #endif
3048  // Extract the 32 most significant bits of v
3049  __m128i v_hi = _mm_srli_epi64(v.val, 32);
3050  // Flip the msb of v_hi and blend with 0x45300000
3051  v_hi = _mm_xor_si128(v_hi, magic_i_hi32);
3052  // Compute in double precision
3053  __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
3054  // (v_hi - magic_d_all) + v_lo; do not assume floating-point addition is associative
3055  __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
3056  return v_float64x2(result);
3057 }
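
// For reference, a minimal scalar sketch of the same decomposition. It is an illustrative
// addition, not part of the original header, and the function name is hypothetical: the
// 64-bit value is split into an exact high part (a signed multiple of 2^32) and an exact
// low part (< 2^32); each converts to double exactly, so only the final addition can round,
// just like the SIMD path above.
inline double example_cvt_i64_to_f64_scalar(int64 x)
{
    uint64 lo = (uint64)x & 0xffffffffU;  // low 32 bits, zero-extended: exact as a double
    int64 hi = x - (int64)lo;             // signed multiple of 2^32: also exact as a double
    return (double)hi + (double)lo;       // a single rounding at the end
}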
3058 
3059 ////////////// Lookup table access ////////////////////
3060 
3061 inline v_int8x16 v_lut(const schar* tab, const int* idx)
3062 {
3063 #if defined(_MSC_VER)
3064  return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
3065  tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
3066 #else
3067  return v_int8x16(_mm_setr_epi64(
3068  _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
3069  _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
3070  ));
3071 #endif
3072 }
3073 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
3074 {
3075 #if defined(_MSC_VER)
3076  return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
3077  *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
3078 #else
3079  return v_int8x16(_mm_setr_epi64(
3080  _mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])),
3081  _mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))
3082  ));
3083 #endif
3084 }
3085 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
3086 {
3087 #if defined(_MSC_VER)
3088  return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3089  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3090 #else
3091  return v_int8x16(_mm_setr_epi64(
3092  _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3093  _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3094  ));
3095 #endif
3096 }
3097 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
3098 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
3099 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
3100 
3101 inline v_int16x8 v_lut(const short* tab, const int* idx)
3102 {
3103 #if defined(_MSC_VER)
3104  return v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
3105  tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
3106 #else
3107  return v_int16x8(_mm_setr_epi64(
3108  _mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
3109  _mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
3110  ));
3111 #endif
3112 }
3113 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
3114 {
3115 #if defined(_MSC_VER)
3116  return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3117  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3118 #else
3119  return v_int16x8(_mm_setr_epi64(
3120  _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3121  _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3122  ));
3123 #endif
3124 }
3125 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
3126 {
3127  return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3128 }
3129 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
3130 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
3131 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
3132 
3133 inline v_int32x4 v_lut(const int* tab, const int* idx)
3134 {
3135 #if defined(_MSC_VER)
3136  return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]],
3137  tab[idx[2]], tab[idx[3]]));
3138 #else
3139  return v_int32x4(_mm_setr_epi64(
3140  _mm_setr_pi32(tab[idx[0]], tab[idx[1]]),
3141  _mm_setr_pi32(tab[idx[2]], tab[idx[3]])
3142  ));
3143 #endif
3144 }
3145 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
3146 {
3147  return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3148 }
3149 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
3150 {
3151  return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3152 }
3153 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
3154 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
3155 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
3156 
3157 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
3158 {
3159  return v_int64x2(_mm_set_epi64x(tab[idx[1]], tab[idx[0]]));
3160 }
3161 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
3162 {
3163  return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3164 }
3165 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
3166 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
3167 
3168 inline v_float32x4 v_lut(const float* tab, const int* idx)
3169 {
3170  return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3171 }
3172 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
3173 inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
3174 
3175 inline v_float64x2 v_lut(const double* tab, const int* idx)
3176 {
3177  return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
3178 }
3179 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
3180 
3181 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
3182 {
3183  int CV_DECL_ALIGNED(32) idx[4];
3184  v_store_aligned(idx, idxvec);
3185  return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3186 }
3187 
3188 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
3189 {
3190  return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
3191 }
3192 
3193 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
3194 {
3195  int CV_DECL_ALIGNED(32) idx[4];
3196  v_store_aligned(idx, idxvec);
3197  return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3198 }
3199 
3200 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
3201 {
3202  int idx[2];
3203  v_store_low(idx, idxvec);
3204  return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
3205 }
3206 
3207 // loads pairs from the table and deinterleaves them, e.g. returns:
3208 //   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
3209 //   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
3210 // note that the indices are indices of individual floats, not of the float pairs.
3211 // in theory, this function can be used to implement bilinear interpolation,
3212 // when idxvec holds the offsets within the image.
3213 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
3214 {
3215  int CV_DECL_ALIGNED(32) idx[4];
3216  v_store_aligned(idx, idxvec);
3217  __m128 z = _mm_setzero_ps();
3218  __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
3219  __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
3220  xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
3221  xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
3222  __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
3223  __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
3224  x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
3225  y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
3226 }
3227 
3228 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
3229 {
3230  int idx[2];
3231  v_store_low(idx, idxvec);
3232  __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
3233  __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
3234  x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
3235  y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
3236 }
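
// Usage sketch for the deinterleaving gather above: fetch four (x, y) float pairs from an
// interleaved point table in a single call, e.g. as the gather step of bilinear
// interpolation. It is an illustrative addition, not part of the original header;
// 'example_gather_xy', 'pts' and 'idx' are hypothetical names.
inline void example_gather_xy(const float* pts /* x0 y0 x1 y1 ... */, const v_int32x4& idx,
                              v_float32x4& xs, v_float32x4& ys)
{
    // idx holds indices of the x components (i.e. 2 * point index), as described above
    v_lut_deinterleave(pts, idx, xs, ys);
    // xs = { pts[idx0], pts[idx1], pts[idx2], pts[idx3] }
    // ys = { pts[idx0+1], pts[idx1+1], pts[idx2+1], pts[idx3+1] }
}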
3237 
3238 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
3239 {
3240 #if CV_SSSE3
3241  return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200)));
3242 #else
3243  __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3244  a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
3245  a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
3246  return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3247 #endif
3248 }
3249 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
3250 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
3251 {
3252 #if CV_SSSE3
3253  return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400)));
3254 #else
3255  __m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3256  return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3257 #endif
3258 }
3259 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
3260 
3261 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
3262 {
3263 #if CV_SSSE3
3264  return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100)));
3265 #else
3266  __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3267  return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)));
3268 #endif
3269 }
3270 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
3271 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
3272 {
3273 #if CV_SSSE3
3274  return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100)));
3275 #else
3276  return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val)));
3277 #endif
3278 }
3279 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
3280 
3281 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
3282 {
3283  return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
3284 }
3285 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3286 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3287 
3288 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
3289 {
3290 #if CV_SSSE3
3291  return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
3292 #else
3293  __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
3294  __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
3295  return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3296 #endif
3297 }
3298 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
3299 
3300 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
3301 {
3302 #if CV_SSSE3
3303  return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100)));
3304 #else
3305  return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3306 #endif
3307 }
3308 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
3309 
3310 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
3311 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
3312 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
3313 
3314 template<int i>
3315 inline uchar v_extract_n(const v_uint8x16& v)
3316 {
3317 #if CV_SSE4_1
3318  return (uchar)_mm_extract_epi8(v.val, i);
3319 #else
3320  return v_rotate_right<i>(v).get0();
3321 #endif
3322 }
3323 
3324 template<int i>
3325 inline schar v_extract_n(const v_int8x16& v)
3326 {
3327  return (schar)v_extract_n<i>(v_reinterpret_as_u8(v));
3328 }
3329 
3330 template<int i>
3331 inline ushort v_extract_n(const v_uint16x8& v)
3332 {
3333  return (ushort)_mm_extract_epi16(v.val, i);
3334 }
3335 
3336 template<int i>
3337 inline short v_extract_n(const v_int16x8& v)
3338 {
3339  return (short)v_extract_n<i>(v_reinterpret_as_u16(v));
3340 }
3341 
3342 template<int i>
3343 inline uint v_extract_n(const v_uint32x4& v)
3344 {
3345 #if CV_SSE4_1
3346  return (uint)_mm_extract_epi32(v.val, i);
3347 #else
3348  return v_rotate_right<i>(v).get0();
3349 #endif
3350 }
3351 
3352 template<int i>
3353 inline int v_extract_n(const v_int32x4& v)
3354 {
3355  return (int)v_extract_n<i>(v_reinterpret_as_u32(v));
3356 }
3357 
3358 template<int i>
3359 inline uint64 v_extract_n(const v_uint64x2& v)
3360 {
3361 #ifdef CV__SIMD_NATIVE_mm_extract_epi64
3362  return (uint64)_v128_extract_epi64<i>(v.val);
3363 #else
3364  return v_rotate_right<i>(v).get0();
3365 #endif
3366 }
3367 
3368 template<int i>
3369 inline int64 v_extract_n(const v_int64x2& v)
3370 {
3371  return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
3372 }
3373 
3374 template<int i>
3375 inline float v_extract_n(const v_float32x4& v)
3376 {
3377  union { uint iv; float fv; } d;
3378  d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
3379  return d.fv;
3380 }
3381 
3382 template<int i>
3383 inline double v_extract_n(const v_float64x2& v)
3384 {
3385  union { uint64 iv; double dv; } d;
3386  d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
3387  return d.dv;
3388 }
3389 
3390 template<int i>
3391 inline v_int32x4 v_broadcast_element(const v_int32x4& v)
3392 {
3393  return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3394 }
3395 
3396 template<int i>
3397 inline v_uint32x4 v_broadcast_element(const v_uint32x4& v)
3398 {
3399  return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3400 }
3401 
3402 template<int i>
3403 inline v_float32x4 v_broadcast_element(const v_float32x4& v)
3404 {
3405  return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i)));
3406 }
3407 
3408 ////////////// FP16 support ///////////////////////////
3409 
3410 inline v_float32x4 v_load_expand(const hfloat* ptr)
3411 {
3412 #if CV_FP16
3413  return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3414 #else
3415  const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
3416  const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
3417  const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
3418  __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
3419  __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
3420  __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
3421  __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
3422 
3423  t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
3424  __m128i zmask = _mm_cmpeq_epi32(e, z);
3425  __m128i ft = v_select_si128(zmask, zt, t);
3426  return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
3427 #endif
3428 }
3429 
3430 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
3431 {
3432 #if CV_FP16
3433  __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
3434  _mm_storel_epi64((__m128i*)ptr, fp16_value);
3435 #else
3436  const __m128i signmask = _mm_set1_epi32(0x80000000);
3437  const __m128i rval = _mm_set1_epi32(0x3f000000);
3438 
3439  __m128i t = _mm_castps_si128(v.val);
3440  __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
3441  t = _mm_andnot_si128(signmask, t);
3442 
3443  __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
3444  __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
3445  __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
3446  __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
3447  __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
3448  tt = _mm_sub_epi32(tt, rval);
3449  __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
3450  __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
3451  nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
3452  t = v_select_si128(tinymask, tt, nt);
3453  t = v_select_si128(finitemask, t, naninf);
3454  t = _mm_or_si128(t, sign);
3455  t = _mm_packs_epi32(t, t);
3456  _mm_storel_epi64((__m128i*)ptr, t);
3457 #endif
3458 }
3459 
3460 inline void v_cleanup() {}
3461 
3462 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3463 
3465 
3466 }
3467 
3468 #endif