EstervQrCode 1.1.1
Library for QR code manipulation
intrin_avx.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4 
5 #ifndef OPENCV_HAL_INTRIN_AVX_HPP
6 #define OPENCV_HAL_INTRIN_AVX_HPP
7 
8 #define CV_SIMD256 1
9 #define CV_SIMD256_64F 1
10 #define CV_SIMD256_FP16 0 // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1)
11 
12 namespace cv
13 {
14 
16 
17 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
18 
20 
21 inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
22 { return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }
23 
24 inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
25 { return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
26 
27 inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
28 { return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }
29 
30 inline int _v_cvtsi256_si32(const __m256i& a)
31 { return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }
32 
33 inline __m256i _v256_shuffle_odd_64(const __m256i& v)
34 { return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
35 
36 inline __m256d _v256_shuffle_odd_64(const __m256d& v)
37 { return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
38 
39 template<int imm>
40 inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
41 { return _mm256_permute2x128_si256(a, b, imm); }
42 
43 template<int imm>
44 inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
45 { return _mm256_permute2f128_ps(a, b, imm); }
46 
47 template<int imm>
48 inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
49 { return _mm256_permute2f128_pd(a, b, imm); }
50 
51 template<int imm, typename _Tpvec>
52 inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
53 { return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
54 
55 template<int imm>
56 inline __m256i _v256_permute4x64(const __m256i& a)
57 { return _mm256_permute4x64_epi64(a, imm); }
58 
59 template<int imm>
60 inline __m256d _v256_permute4x64(const __m256d& a)
61 { return _mm256_permute4x64_pd(a, imm); }
62 
63 template<int imm, typename _Tpvec>
64 inline _Tpvec v256_permute4x64(const _Tpvec& a)
65 { return _Tpvec(_v256_permute4x64<imm>(a.val)); }
66 
67 inline __m128i _v256_extract_high(const __m256i& v)
68 { return _mm256_extracti128_si256(v, 1); }
69 
70 inline __m128 _v256_extract_high(const __m256& v)
71 { return _mm256_extractf128_ps(v, 1); }
72 
73 inline __m128d _v256_extract_high(const __m256d& v)
74 { return _mm256_extractf128_pd(v, 1); }
75 
76 inline __m128i _v256_extract_low(const __m256i& v)
77 { return _mm256_castsi256_si128(v); }
78 
79 inline __m128 _v256_extract_low(const __m256& v)
80 { return _mm256_castps256_ps128(v); }
81 
82 inline __m128d _v256_extract_low(const __m256d& v)
83 { return _mm256_castpd256_pd128(v); }
84 
85 inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
86 {
87  const __m256i m = _mm256_set1_epi32(65535);
88  __m256i am = _mm256_min_epu32(a, m);
89  __m256i bm = _mm256_min_epu32(b, m);
90  return _mm256_packus_epi32(am, bm);
91 }
92 
93 template<int i>
94 inline int _v256_extract_epi8(const __m256i& a)
95 {
96 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
97  return _mm256_extract_epi8(a, i);
98 #else
99  __m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
100  return _mm_extract_epi8(b, i & 15); // SSE4.1
101 #endif
102 }
103 
104 template<int i>
105 inline int _v256_extract_epi16(const __m256i& a)
106 {
107 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
108  return _mm256_extract_epi16(a, i);
109 #else
110  __m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
111  return _mm_extract_epi16(b, i & 7); // SSE2
112 #endif
113 }
114 
115 template<int i>
116 inline int _v256_extract_epi32(const __m256i& a)
117 {
118 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
119  return _mm256_extract_epi32(a, i);
120 #else
121  __m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
122  return _mm_extract_epi32(b, i & 3); // SSE4.1
123 #endif
124 }
125 
126 template<int i>
127 inline int64 _v256_extract_epi64(const __m256i& a)
128 {
129 #if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
130  return _mm256_extract_epi64(a, i);
131 #else
132  __m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
133  return _mm_extract_epi64(b, i & 1); // SSE4.1
134 #endif
135 }
136 
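The helpers above treat a 256-bit register as a pair of 128-bit halves: _v256_combine joins two halves, _v256_extract_low/_v256_extract_high split them again, and the _v256_extract_epi* templates fall back to a 128-bit extract when the native 256-bit intrinsic is not available. A minimal round-trip sketch (illustrative only, not part of the header; assumes AVX2 is enabled):

    __m128i lo = _mm_set1_epi32(1), hi = _mm_set1_epi32(2);
    __m256i v  = _v256_combine(lo, hi);       // low half all 1s, high half all 2s
    int x = _v256_extract_epi32<0>(v);        // 1, taken from the low half
    int y = _v256_extract_epi32<7>(v);        // 2, taken from the high half
    __m128i back = _v256_extract_high(v);     // equals hi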
138 
139 struct v_uint8x32
140 {
141  typedef uchar lane_type;
142  enum { nlanes = 32 };
143  __m256i val;
144 
145  explicit v_uint8x32(__m256i v) : val(v) {}
146  v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3,
147  uchar v4, uchar v5, uchar v6, uchar v7,
148  uchar v8, uchar v9, uchar v10, uchar v11,
149  uchar v12, uchar v13, uchar v14, uchar v15,
150  uchar v16, uchar v17, uchar v18, uchar v19,
151  uchar v20, uchar v21, uchar v22, uchar v23,
152  uchar v24, uchar v25, uchar v26, uchar v27,
153  uchar v28, uchar v29, uchar v30, uchar v31)
154  {
155  val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
156  (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9,
157  (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
158  (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
159  (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
160  (char)v28, (char)v29, (char)v30, (char)v31);
161  }
162  /* coverity[uninit_ctor]: suppress warning */
163  v_uint8x32() {}
164 
165  uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
166 };
167 
168 struct v_int8x32
169 {
170  typedef schar lane_type;
171  enum { nlanes = 32 };
172  __m256i val;
173 
174  explicit v_int8x32(__m256i v) : val(v) {}
175  v_int8x32(schar v0, schar v1, schar v2, schar v3,
176  schar v4, schar v5, schar v6, schar v7,
177  schar v8, schar v9, schar v10, schar v11,
178  schar v12, schar v13, schar v14, schar v15,
179  schar v16, schar v17, schar v18, schar v19,
180  schar v20, schar v21, schar v22, schar v23,
181  schar v24, schar v25, schar v26, schar v27,
182  schar v28, schar v29, schar v30, schar v31)
183  {
184  val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
185  v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
186  v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
187  }
188  /* coverity[uninit_ctor]: suppress warning */
189  v_int8x32() {}
190 
191  schar get0() const { return (schar)_v_cvtsi256_si32(val); }
192 };
193 
194 struct v_uint16x16
195 {
196  typedef ushort lane_type;
197  enum { nlanes = 16 };
198  __m256i val;
199 
200  explicit v_uint16x16(__m256i v) : val(v) {}
201  v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3,
202  ushort v4, ushort v5, ushort v6, ushort v7,
203  ushort v8, ushort v9, ushort v10, ushort v11,
204  ushort v12, ushort v13, ushort v14, ushort v15)
205  {
206  val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
207  (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
208  (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
209  }
210  /* coverity[uninit_ctor]: suppress warning */
211  v_uint16x16() {}
212 
213  ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
214 };
215 
216 struct v_int16x16
217 {
218  typedef short lane_type;
219  enum { nlanes = 16 };
220  __m256i val;
221 
222  explicit v_int16x16(__m256i v) : val(v) {}
223  v_int16x16(short v0, short v1, short v2, short v3,
224  short v4, short v5, short v6, short v7,
225  short v8, short v9, short v10, short v11,
226  short v12, short v13, short v14, short v15)
227  {
228  val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
229  v8, v9, v10, v11, v12, v13, v14, v15);
230  }
231  /* coverity[uninit_ctor]: suppress warning */
232  v_int16x16() {}
233 
234  short get0() const { return (short)_v_cvtsi256_si32(val); }
235 };
236 
237 struct v_uint32x8
238 {
239  typedef unsigned lane_type;
240  enum { nlanes = 8 };
241  __m256i val;
242 
243  explicit v_uint32x8(__m256i v) : val(v) {}
244  v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
245  unsigned v4, unsigned v5, unsigned v6, unsigned v7)
246  {
247  val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
248  (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
249  }
250  /* coverity[uninit_ctor]: suppress warning */
251  v_uint32x8() {}
252 
253  unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
254 };
255 
256 struct v_int32x8
257 {
258  typedef int lane_type;
259  enum { nlanes = 8 };
260  __m256i val;
261 
262  explicit v_int32x8(__m256i v) : val(v) {}
263  v_int32x8(int v0, int v1, int v2, int v3,
264  int v4, int v5, int v6, int v7)
265  {
266  val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
267  }
268  /* coverity[uninit_ctor]: suppress warning */
269  v_int32x8() {}
270 
271  int get0() const { return _v_cvtsi256_si32(val); }
272 };
273 
274 struct v_float32x8
275 {
276  typedef float lane_type;
277  enum { nlanes = 8 };
278  __m256 val;
279 
280  explicit v_float32x8(__m256 v) : val(v) {}
281  v_float32x8(float v0, float v1, float v2, float v3,
282  float v4, float v5, float v6, float v7)
283  {
284  val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
285  }
286  /* coverity[uninit_ctor]: suppress warning */
287  v_float32x8() {}
288 
289  float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
290 };
291 
292 struct v_uint64x4
293 {
294  typedef uint64 lane_type;
295  enum { nlanes = 4 };
296  __m256i val;
297 
298  explicit v_uint64x4(__m256i v) : val(v) {}
299  v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
300  { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
301  /* coverity[uninit_ctor]: suppress warning */
302  v_uint64x4() {}
303 
304  uint64 get0() const
305  {
306  #if defined __x86_64__ || defined _M_X64
307  return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
308  #else
309  int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
310  int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
311  return (unsigned)a | ((uint64)(unsigned)b << 32);
312  #endif
313  }
314 };
315 
316 struct v_int64x4
317 {
318  typedef int64 lane_type;
319  enum { nlanes = 4 };
320  __m256i val;
321 
322  explicit v_int64x4(__m256i v) : val(v) {}
323  v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
324  { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
325  /* coverity[uninit_ctor]: suppress warning */
326  v_int64x4() {}
327 
328  int64 get0() const
329  {
330  #if defined __x86_64__ || defined _M_X64
331  return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
332  #else
333  int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
334  int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
335  return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
336  #endif
337  }
338 };
339 
340 struct v_float64x4
341 {
342  typedef double lane_type;
343  enum { nlanes = 4 };
344  __m256d val;
345 
346  explicit v_float64x4(__m256d v) : val(v) {}
347  v_float64x4(double v0, double v1, double v2, double v3)
348  { val = _mm256_setr_pd(v0, v1, v2, v3); }
349  /* coverity[uninit_ctor]: suppress warning */
350  v_float64x4() {}
351 
352  double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
353 };
354 
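Each v_* struct above is a thin wrapper around one __m256i/__m256/__m256d register; nlanes gives the lane count and get0() reads lane 0. Usage sketch (illustrative only, not part of the header):

    v_float32x8 v(1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f); // setr order, lane 0 first
    float first = v.get0();                                 // 1.f
    int   lanes = v_float32x8::nlanes;                      // 8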
356 
357 #define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \
358  inline _Tpvec v256_load(const _Tp* ptr) \
359  { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \
360  inline _Tpvec v256_load_aligned(const _Tp* ptr) \
361  { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \
362  inline _Tpvec v256_load_low(const _Tp* ptr) \
363  { \
364  __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \
365  return _Tpvec(_mm256_castsi128_si256(v128)); \
366  } \
367  inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
368  { \
369  __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \
370  __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \
371  return _Tpvec(_v256_combine(vlo, vhi)); \
372  } \
373  inline void v_store(_Tp* ptr, const _Tpvec& a) \
374  { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
375  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
376  { _mm256_store_si256((__m256i*)ptr, a.val); } \
377  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
378  { _mm256_stream_si256((__m256i*)ptr, a.val); } \
379  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
380  { \
381  if( mode == hal::STORE_UNALIGNED ) \
382  _mm256_storeu_si256((__m256i*)ptr, a.val); \
383  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
384  _mm256_stream_si256((__m256i*)ptr, a.val); \
385  else \
386  _mm256_store_si256((__m256i*)ptr, a.val); \
387  } \
388  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
389  { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
390  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
391  { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
392 
393 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar)
394 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar)
395 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
396 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short)
397 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned)
398 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int)
399 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64)
400 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
401 
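The OPENCV_HAL_IMPL_AVX_LOADSTORE instantiations above give every integer vector type unaligned and aligned loads/stores plus half-register variants. A hedged sketch (not part of the header): v256_load_aligned and v_store_aligned expect 32-byte aligned pointers, and v_store_aligned_nocache issues a non-temporal store.

    uchar CV_DECL_ALIGNED(32) buf[32] = {0};
    v_uint8x32 a = v256_load(buf);           // unaligned load, always safe
    v_uint8x32 b = v256_load_aligned(buf);   // requires 32-byte alignment
    v_store_low(buf, a);                     // writes only the low 16 bytes
    v_store(buf, b, hal::STORE_ALIGNED_NOCACHE);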
402 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
403  inline _Tpvec v256_load(const _Tp* ptr) \
404  { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \
405  inline _Tpvec v256_load_aligned(const _Tp* ptr) \
406  { return _Tpvec(_mm256_load_##suffix(ptr)); } \
407  inline _Tpvec v256_load_low(const _Tp* ptr) \
408  { \
409  return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \
410  (_mm_loadu_##suffix(ptr))); \
411  } \
412  inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
413  { \
414  halfreg vlo = _mm_loadu_##suffix(ptr0); \
415  halfreg vhi = _mm_loadu_##suffix(ptr1); \
416  return _Tpvec(_v256_combine(vlo, vhi)); \
417  } \
418  inline void v_store(_Tp* ptr, const _Tpvec& a) \
419  { _mm256_storeu_##suffix(ptr, a.val); } \
420  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
421  { _mm256_store_##suffix(ptr, a.val); } \
422  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
423  { _mm256_stream_##suffix(ptr, a.val); } \
424  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
425  { \
426  if( mode == hal::STORE_UNALIGNED ) \
427  _mm256_storeu_##suffix(ptr, a.val); \
428  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
429  _mm256_stream_##suffix(ptr, a.val); \
430  else \
431  _mm256_store_##suffix(ptr, a.val); \
432  } \
433  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
434  { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
435  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
436  { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
437 
438 OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128)
439 OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
440 
441 #define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
442  inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
443  { return _Tpvec(cast(a.val)); }
444 
445 #define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
446  inline _Tpvec v256_setzero_##suffix() \
447  { return _Tpvec(_mm256_setzero_si256()); } \
448  inline _Tpvec v256_setall_##suffix(_Tp v) \
449  { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
450  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
451  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
452  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
453  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \
454  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \
455  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \
456  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \
457  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \
458  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \
459  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
460 
461 OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char)
462 OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char)
463 OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short)
464 OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short)
465 OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int)
466 OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int)
467 OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64)
468 OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
469 
470 #define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
471  inline _Tpvec v256_setzero_##suffix() \
472  { return _Tpvec(_mm256_setzero_##zsuffix()); } \
473  inline _Tpvec v256_setall_##suffix(_Tp v) \
474  { return _Tpvec(_mm256_set1_##zsuffix(v)); } \
475  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
476  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
477  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
478  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \
479  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \
480  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \
481  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \
482  OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast)
483 
484 OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps)
485 OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
486 
487 inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
488 { return a; }
489 inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
490 { return v_float32x8(_mm256_castpd_ps(a.val)); }
491 
492 inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
493 { return a; }
494 inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
495 { return v_float64x4(_mm256_castps_pd(a.val)); }
496 
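The INIT macros generate v256_setzero_*/v256_setall_* constructors plus v_reinterpret_as_* bit-casts between all the vector types. A short sketch (illustrative only):

    v_int32x8   ones = v256_setall_s32(-1);         // every lane -1, i.e. all bits set
    v_float32x8 f    = v_reinterpret_as_f32(ones);  // same bits, each lane now a NaN pattern
    v_uint8x32  u    = v_reinterpret_as_u8(ones);   // 32 lanes of 0xFF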
497 /* Recombine */
498 /*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
499  inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
500  { return _Tpvec(perm(a.val, b.val, 0x20)); } \
501  inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
502  { return _Tpvec(perm(a.val, b.val, 0x31)); } \
503  inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
504  _Tpvec& c, _Tpvec& d) \
505  { c = v_combine_low(a, b); d = v_combine_high(a, b); }
506 
507 #define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \
508  OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \
509  inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \
510  _Tpvec& b0, _Tpvec& b1) \
511  { \
512  __m256i v0 = _v256_shuffle_odd_64(a0.val); \
513  __m256i v1 = _v256_shuffle_odd_64(a1.val); \
514  b0.val = _mm256_unpacklo_##suffix(v0, v1); \
515  b1.val = _mm256_unpackhi_##suffix(v0, v1); \
516  }
517 
518 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8)
519 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8)
520 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
521 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16)
522 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32)
523 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32)
524 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64)
525 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64)
526 OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
527 OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
528 
529 inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
530 {
531  __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
532  __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
533  v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
534 }
535 
536 inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
537 {
538  __m256d v0 = _v256_shuffle_odd_64(a0.val);
539  __m256d v1 = _v256_shuffle_odd_64(a1.val);
540  b0.val = _mm256_unpacklo_pd(v0, v1);
541  b1.val = _mm256_unpackhi_pd(v0, v1);
542 }*/
543 
545 
546 // unpacks
547 #define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \
548  inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
549  { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \
550  inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
551  { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
552 
553 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8)
554 OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8)
555 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
556 OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16)
557 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32)
558 OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32)
559 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64)
560 OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64)
561 OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
562 OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
563 
564 // blend
565 #define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \
566  template<int m> \
567  inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \
568  { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
569 
570 OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
571 OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16)
572 OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32)
573 OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32)
574 OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
575 OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
576 
577 template<int m>
578 inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
579 {
580  enum {M0 = m};
581  enum {M1 = (M0 | (M0 << 2)) & 0x33};
582  enum {M2 = (M1 | (M1 << 1)) & 0x55};
583  enum {MM = M2 | (M2 << 1)};
584  return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
585 }
586 template<int m>
587 inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
588 { return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
589 
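v256_blend for 64-bit lanes is emulated on top of _mm256_blend_epi32 by expanding each bit of the 4-bit lane mask into two adjacent bits of an 8-bit mask. Worked example of the compile-time expansion (illustrative only):

    // For m = 0xA (binary 1010, take 64-bit lanes 1 and 3 from b):
    //   M1 = (0x0A | 0x28) & 0x33 = 0x22
    //   M2 = (0x22 | 0x44) & 0x55 = 0x44
    //   MM = 0x44 | 0x88          = 0xCC   // 32-bit lanes 2,3,6,7 taken from b
    v_uint64x4 r = v256_blend<0xA>(v256_setall_u64(1), v256_setall_u64(2)); // {1, 2, 1, 2}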
590 // shuffle
591 // todo: emulate 64bit
592 #define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \
593  template<int m> \
594  inline _Tpvec v256_shuffle(const _Tpvec& a) \
595  { return _Tpvec(_mm256_##intrin(a.val, m)); }
596 
597 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32)
598 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32)
599 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
600 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
601 
602 template<typename _Tpvec>
603 inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
604 {
605  ab0 = v256_unpacklo(a, b);
606  ab1 = v256_unpackhi(a, b);
607 }
608 
609 template<typename _Tpvec>
610 inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
611 { return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }
612 
613 inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
614 { return v256_blend<0xf0>(a, b); }
615 
616 inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
617 { return v256_blend<0xc>(a, b); }
618 
619 template<typename _Tpvec>
620 inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
621 { return v256_permute2x128<0x21>(a, b); }
622 
623 template<typename _Tpvec>
624 inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
625 { return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
626 inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
627 { return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }
628 // todo: emulate float32
629 
630 template<typename _Tpvec>
631 inline _Tpvec v256_swap_halves(const _Tpvec& a)
632 { return v256_permute2x128<1>(a, a); }
633 
634 template<typename _Tpvec>
635 inline _Tpvec v256_reverse_64(const _Tpvec& a)
636 { return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
637 
638 // ZIP
639 #define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \
640  inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
641  { return v256_permute2x128<0x20>(a, b); } \
642  inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
643  { return v256_permute2x128<0x31>(a, b); } \
644  inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
645  _Tpvec& c, _Tpvec& d) \
646  { \
647  _Tpvec a1b0 = v256_alignr_128(a, b); \
648  c = v256_combine_diagonal(a, a1b0); \
649  d = v256_combine_diagonal(a1b0, b); \
650  } \
651  inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
652  _Tpvec& ab0, _Tpvec& ab1) \
653  { \
654  _Tpvec ab0ab2, ab1ab3; \
655  v256_zip(a, b, ab0ab2, ab1ab3); \
656  v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
657  }
658 
659 OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
660 OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
661 OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
662 OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
663 OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
664 OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
665 OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
666 OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
667 OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
668 OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
669 
670 
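The ZIP block builds the cross-lane operations from per-128-bit-lane primitives: v_combine_low/v_combine_high join the low or high halves of two registers, v_recombine produces both at once, and v_zip interleaves two vectors element-wise. Expected results, as a sketch (illustrative only):

    v_int32x8 a(0, 1, 2, 3, 4, 5, 6, 7);
    v_int32x8 b(10, 11, 12, 13, 14, 15, 16, 17);
    v_int32x8 lo = v_combine_low(a, b);   // {0,1,2,3, 10,11,12,13}
    v_int32x8 ab0, ab1;
    v_zip(a, b, ab0, ab1);                // ab0 = {0,10,1,11,2,12,3,13}
                                          // ab1 = {4,14,5,15,6,16,7,17}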
672 /* Element-wise binary and unary operations */
673 
674 
675 #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
676  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
677  { return _Tpvec(intrin(a.val, b.val)); } \
678  inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
679  { a.val = intrin(a.val, b.val); return a; }
680 
681 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
682 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
683 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
684 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
685 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
686 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
687 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
688 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
689 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
690 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
691 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
692 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
693 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
694 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
695 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
696 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
697 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
698 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
699 
700 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
701 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
702 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
703 OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
704 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
705 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
706 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
707 OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
708 
709 // saturating multiply 8-bit, 16-bit
710 inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
711 {
712  v_uint16x16 c, d;
713  v_mul_expand(a, b, c, d);
714  return v_pack(c, d);
715 }
716 inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
717 {
718  v_int16x16 c, d;
719  v_mul_expand(a, b, c, d);
720  return v_pack(c, d);
721 }
722 inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
723 {
724  __m256i pl = _mm256_mullo_epi16(a.val, b.val);
725  __m256i ph = _mm256_mulhi_epu16(a.val, b.val);
726  __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
727  __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
728  return v_uint16x16(_v256_packs_epu32(p0, p1));
729 }
730 inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
731 {
732  __m256i pl = _mm256_mullo_epi16(a.val, b.val);
733  __m256i ph = _mm256_mulhi_epi16(a.val, b.val);
734  __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
735  __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
736  return v_int16x16(_mm256_packs_epi32(p0, p1));
737 }
738 inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
739 { a = a * b; return a; }
740 inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
741 { a = a * b; return a; }
742 inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
743 { a = a * b; return a; }
744 inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
745 { a = a * b; return a; }
746 
748 #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
749  inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
750  { return _Tpvec(intrin(a.val, b.val)); }
751 
752 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
753 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
754 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
755 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
756 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
757 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
758 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
759 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
760 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
761 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16)
762 
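Note the split above: operator+ and operator- on 8- and 16-bit vectors map to the saturating adds/subs intrinsics, while v_add_wrap/v_sub_wrap/v_mul_wrap use the plain modulo forms. Quick illustration (sketch only, not part of the header):

    v_uint8x32 a = v256_setall_u8(200), b = v256_setall_u8(100);
    v_uint8x32 s = a + b;            // every lane == 255 (saturated)
    v_uint8x32 w = v_add_wrap(a, b); // every lane == 44, (200 + 100) mod 256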
763 inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
764 {
765  __m256i ad = _mm256_srai_epi16(a.val, 8);
766  __m256i bd = _mm256_srai_epi16(b.val, 8);
767  __m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even
768  __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd
769 
770  const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
771  return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
772 }
773 inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
774 {
775  return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
776 }
777 
778 // Multiply and expand
779 inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
780  v_uint16x16& c, v_uint16x16& d)
781 {
782  v_uint16x16 a0, a1, b0, b1;
783  v_expand(a, a0, a1);
784  v_expand(b, b0, b1);
785  c = v_mul_wrap(a0, b0);
786  d = v_mul_wrap(a1, b1);
787 }
788 
789 inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
790  v_int16x16& c, v_int16x16& d)
791 {
792  v_int16x16 a0, a1, b0, b1;
793  v_expand(a, a0, a1);
794  v_expand(b, b0, b1);
795  c = v_mul_wrap(a0, b0);
796  d = v_mul_wrap(a1, b1);
797 }
798 
799 inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
800  v_int32x8& c, v_int32x8& d)
801 {
802  v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
803 
804  v_int16x16 v0, v1;
805  v_zip(v_mul_wrap(a, b), vhi, v0, v1);
806 
807  c = v_reinterpret_as_s32(v0);
808  d = v_reinterpret_as_s32(v1);
809 }
810 
811 inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
812  v_uint32x8& c, v_uint32x8& d)
813 {
814  v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
815 
816  v_uint16x16 v0, v1;
817  v_zip(v_mul_wrap(a, b), vhi, v0, v1);
818 
819  c = v_reinterpret_as_u32(v0);
820  d = v_reinterpret_as_u32(v1);
821 }
822 
823 inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
824  v_uint64x4& c, v_uint64x4& d)
825 {
826  __m256i v0 = _mm256_mul_epu32(a.val, b.val);
827  __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
828  v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
829 }
830 
831 inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
832 inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }
833 
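v_mul_expand widens the product: mullo/mulhi give the low and high 16-bit halves of each 32-bit product, and v_zip re-interleaves them so that c holds the full products of the low eight lanes and d those of the high eight. Sketch (illustrative only):

    v_int16x16 a = v256_setall_s16(300), b = v256_setall_s16(300);
    v_int32x8 c, d;
    v_mul_expand(a, b, c, d);        // every lane of c and d == 90000, no overflow
    v_int16x16 hi = v_mul_hi(a, b);  // every lane == 90000 >> 16 == 1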
835 #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
836  inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
837  { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
838  inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
839  { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
840  inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
841  { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
842  inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
843  { return _Tpsvec(srai(a.val, imm)); } \
844  template<int imm> \
845  inline _Tpuvec v_shl(const _Tpuvec& a) \
846  { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
847  template<int imm> \
848  inline _Tpsvec v_shl(const _Tpsvec& a) \
849  { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
850  template<int imm> \
851  inline _Tpuvec v_shr(const _Tpuvec& a) \
852  { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
853  template<int imm> \
854  inline _Tpsvec v_shr(const _Tpsvec& a) \
855  { return _Tpsvec(srai(a.val, imm)); }
856 
857 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
858 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32)
859 
860 inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
861 {
862  __m256i d = _mm256_set1_epi64x((int64)1 << 63);
863  __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
864  return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
865 }
866 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx)
867 
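AVX2 has no 64-bit arithmetic right shift, so _mm256_srai_epi64xx emulates it with a bias: add 2^63 to make every value non-negative, shift logically, then subtract the shifted bias. Worked check (illustrative): for a = -8 and imm = 1, ((-8 + 2^63) >> 1) - (2^63 >> 1) = (2^62 - 4) - 2^62 = -4.

    v_int64x4 v(-8, -8, 8, 8);
    v_int64x4 r = v >> 1;            // {-4, -4, 4, 4}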
868 
869 
870 #define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
871  OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
872  OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
873  OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
874  inline _Tpvec operator ~ (const _Tpvec& a) \
875  { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
876 
877 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
878 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1))
879 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1))
880 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1))
881 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1))
882 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1))
883 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1))
884 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1))
885 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
886 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
887 
889 #define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \
890  inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
891  { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }
892 
893 OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8)
894 OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8)
895 OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
896 OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8)
897 OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8)
898 OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8)
899 OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
900 OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
901 
902 
903 #define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
904  inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
905  { return ~(a == b); } \
906  inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
907  { return b > a; } \
908  inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
909  { return ~(a < b); } \
910  inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
911  { return b >= a; }
912 
913 #define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
914  inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
915  { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
916  inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
917  { \
918  __m256i smask = _mm256_set1_##suffix(sbit); \
919  return _Tpuvec(_mm256_cmpgt_##suffix( \
920  _mm256_xor_si256(a.val, smask), \
921  _mm256_xor_si256(b.val, smask))); \
922  } \
923  inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
924  { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
925  inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
926  { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
927  OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
928  OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
929 
930 OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128)
931 OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
932 OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
933 
934 #define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
935  inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
936  { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
937  inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
938  { return ~(a == b); }
939 
940 OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
941 OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
942 
943 #define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
944  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
945  { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
946 
947 #define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
948  OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
949  OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
950  OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
951  OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
952  OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
953  OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
954 
955 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
956 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
957 
958 inline v_float32x8 v_not_nan(const v_float32x8& a)
959 { return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
960 inline v_float64x4 v_not_nan(const v_float64x4& a)
961 { return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
962 
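The float comparisons above return all-ones/all-zero lane masks (ordered, non-signalling predicates), v_not_nan marks lanes that are not NaN, and v_select picks between two vectors per lane. A typical clamp, as a sketch (not part of the header):

    float buf[8] = { -1.f, 2.f, -3.f, 4.f, -5.f, 6.f, -7.f, 8.f };
    v_float32x8 x   = v256_load(buf);
    v_float32x8 neg = x < v256_setall_f32(0.f);               // all-ones where x < 0
    v_float32x8 y   = v_select(neg, v256_setall_f32(0.f), x); // {0,2,0,4,0,6,0,8}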
964 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8)
965 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8)
966 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8)
967 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8)
968 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
969 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
970 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16)
971 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16)
972 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32)
973 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32)
974 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32)
975 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32)
976 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
977 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
978 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
979 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
980 
981 
982 template<int imm>
983 inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
984 {
985  enum {IMM_R = (16 - imm) & 0xFF};
986  enum {IMM_R2 = (32 - imm) & 0xFF};
987 
988  if (imm == 0) return a;
989  if (imm == 32) return b;
990  if (imm > 32) return v_uint8x32();
991 
992  __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
993  if (imm == 16) return v_uint8x32(swap);
994  if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
995  return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2)); // imm < 32
996 }
997 
998 template<int imm>
999 inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
1000 {
1001  enum {IMM_L = (imm - 16) & 0xFF};
1002 
1003  if (imm == 0) return a;
1004  if (imm == 32) return b;
1005  if (imm > 32) return v_uint8x32();
1006 
1007  __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
1008  if (imm == 16) return v_uint8x32(swap);
1009  if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
1010  return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L));
1011 }
1012 
1013 template<int imm>
1014 inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
1015 {
1016  enum {IMM_L = (imm - 16) & 0xFF};
1017  enum {IMM_R = (16 - imm) & 0xFF};
1018 
1019  if (imm == 0) return a;
1020  if (imm > 32) return v_uint8x32();
1021 
1022  // ESAC control[3] ? [127:0] = 0
1023  __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
1024  if (imm == 16) return v_uint8x32(swapz);
1025  if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
1026  return v_uint8x32(_mm256_slli_si256(swapz, IMM_L));
1027 }
1028 
1029 template<int imm>
1030 inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
1031 {
1032  enum {IMM_L = (imm - 16) & 0xFF};
1033 
1034  if (imm == 0) return a;
1035  if (imm > 32) return v_uint8x32();
1036 
1037  // ESAC control[3] ? [127:0] = 0
1038  __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
1039  if (imm == 16) return v_uint8x32(swapz);
1040  if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
1041  return v_uint8x32(_mm256_srli_si256(swapz, IMM_L));
1042 }
1043 
1044 #define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \
1045  template<int imm> \
1046  inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
1047  { \
1048  enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1049  v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a), \
1050  v_reinterpret_as_u8(b)); \
1051  return _Tpvec(cast(ret.val)); \
1052  } \
1053  template<int imm> \
1054  inline _Tpvec intrin(const _Tpvec& a) \
1055  { \
1056  enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1057  v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a)); \
1058  return _Tpvec(cast(ret.val)); \
1059  }
1060 
1061 #define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \
1062  OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \
1063  OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
1064 
1065 OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
1066 OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
1067 OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
1068 OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
1069 OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
1070 OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
1071 OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)
1072 
1073 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps)
1074 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
1075 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
1076 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
1077 
1078 
1079 inline v_uint8x32 v_reverse(const v_uint8x32 &a)
1080 {
1081  static const __m256i perm = _mm256_setr_epi8(
1082  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
1083  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1084  __m256i vec = _mm256_shuffle_epi8(a.val, perm);
1085  return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
1086 }
1087 
1088 inline v_int8x32 v_reverse(const v_int8x32 &a)
1089 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1090 
1091 inline v_uint16x16 v_reverse(const v_uint16x16 &a)
1092 {
1093  static const __m256i perm = _mm256_setr_epi8(
1094  14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
1095  14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1096  __m256i vec = _mm256_shuffle_epi8(a.val, perm);
1097  return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
1098 }
1099 
1100 inline v_int16x16 v_reverse(const v_int16x16 &a)
1101 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1102 
1103 inline v_uint32x8 v_reverse(const v_uint32x8 &a)
1104 {
1105  static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1106  return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
1107 }
1108 
1109 inline v_int32x8 v_reverse(const v_int32x8 &a)
1110 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1111 
1112 inline v_float32x8 v_reverse(const v_float32x8 &a)
1113 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1114 
1115 inline v_uint64x4 v_reverse(const v_uint64x4 &a)
1116 {
1117  return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
1118 }
1119 
1120 inline v_int64x4 v_reverse(const v_int64x4 &a)
1121 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1122 
1123 inline v_float64x4 v_reverse(const v_float64x4 &a)
1124 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1125 
1127 
1129 inline unsigned v_reduce_sum(const v_uint8x32& a)
1130 {
1131  __m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
1132  __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1133  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1134 }
1135 inline int v_reduce_sum(const v_int8x32& a)
1136 {
1137  __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
1138  __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1139  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
1140 }
1141 #define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
1142  inline sctype v_reduce_##func(const _Tpvec& a) \
1143  { \
1144  __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
1145  val = intrin(val, _mm_srli_si128(val,8)); \
1146  val = intrin(val, _mm_srli_si128(val,4)); \
1147  val = intrin(val, _mm_srli_si128(val,2)); \
1148  val = intrin(val, _mm_srli_si128(val,1)); \
1149  return (sctype)_mm_cvtsi128_si32(val); \
1150  }
1151 
1152 OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
1153 OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
1154 OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
1155 OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)
1156 
1157 #define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
1158  inline sctype v_reduce_##func(const _Tpvec& a) \
1159  { \
1160  __m128i v0 = _v256_extract_low(a.val); \
1161  __m128i v1 = _v256_extract_high(a.val); \
1162  v0 = intrin(v0, v1); \
1163  v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
1164  v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
1165  v0 = intrin(v0, _mm_srli_si128(v0, 2)); \
1166  return (sctype) _mm_cvtsi128_si32(v0); \
1167  }
1168 
1169 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
1170 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16)
1171 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
1172 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16)
1173 
1174 #define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
1175  inline sctype v_reduce_##func(const _Tpvec& a) \
1176  { \
1177  __m128i v0 = _v256_extract_low(a.val); \
1178  __m128i v1 = _v256_extract_high(a.val); \
1179  v0 = intrin(v0, v1); \
1180  v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
1181  v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
1182  return (sctype) _mm_cvtsi128_si32(v0); \
1183  }
1184 
1185 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
1186 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32)
1187 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
1188 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
1189 
1190 #define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \
1191  inline float v_reduce_##func(const v_float32x8& a) \
1192  { \
1193  __m128 v0 = _v256_extract_low(a.val); \
1194  __m128 v1 = _v256_extract_high(a.val); \
1195  v0 = intrin(v0, v1); \
1196  v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
1197  v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 1))); \
1198  return _mm_cvtss_f32(v0); \
1199  }
1200 
1201 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
1202 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
1203 
1204 inline int v_reduce_sum(const v_int32x8& a)
1205 {
1206  __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
1207  s0 = _mm256_hadd_epi32(s0, s0);
1208 
1209  __m128i s1 = _v256_extract_high(s0);
1210  s1 = _mm_add_epi32(_v256_extract_low(s0), s1);
1211 
1212  return _mm_cvtsi128_si32(s1);
1213 }
1214 
1215 inline unsigned v_reduce_sum(const v_uint32x8& a)
1216 { return v_reduce_sum(v_reinterpret_as_s32(a)); }
1217 
1218 inline int v_reduce_sum(const v_int16x16& a)
1219 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1220 inline unsigned v_reduce_sum(const v_uint16x16& a)
1221 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1222 
1223 inline float v_reduce_sum(const v_float32x8& a)
1224 {
1225  __m256 s0 = _mm256_hadd_ps(a.val, a.val);
1226  s0 = _mm256_hadd_ps(s0, s0);
1227 
1228  __m128 s1 = _v256_extract_high(s0);
1229  s1 = _mm_add_ps(_v256_extract_low(s0), s1);
1230 
1231  return _mm_cvtss_f32(s1);
1232 }
1233 
1234 inline uint64 v_reduce_sum(const v_uint64x4& a)
1235 {
1236  uint64 CV_DECL_ALIGNED(32) idx[2];
1237  _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
1238  return idx[0] + idx[1];
1239 }
1240 inline int64 v_reduce_sum(const v_int64x4& a)
1241 {
1242  int64 CV_DECL_ALIGNED(32) idx[2];
1243  _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
1244  return idx[0] + idx[1];
1245 }
1246 inline double v_reduce_sum(const v_float64x4& a)
1247 {
1248  __m256d s0 = _mm256_hadd_pd(a.val, a.val);
1249  return _mm_cvtsd_f64(_mm_add_pd(_v256_extract_low(s0), _v256_extract_high(s0)));
1250 }
1251 
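The reductions above collapse a whole register to one scalar: byte sums go through _mm256_sad_epu8, 32-bit and float sums through horizontal adds, and min/max fold the two 128-bit halves and then keep halving the stride. Usage sketch (illustrative only):

    int buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v_int32x8 v = v256_load(buf);
    int total = v_reduce_sum(v);   // 36
    int lo    = v_reduce_min(v);   // 1
    int hi    = v_reduce_max(v);   // 8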
1252 inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
1253  const v_float32x8& c, const v_float32x8& d)
1254 {
1255  __m256 ab = _mm256_hadd_ps(a.val, b.val);
1256  __m256 cd = _mm256_hadd_ps(c.val, d.val);
1257  return v_float32x8(_mm256_hadd_ps(ab, cd));
1258 }
1259 
1260 inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
1261 {
1262  __m256i half = _mm256_sad_epu8(a.val, b.val);
1263  __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1264  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1265 }
1266 inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
1267 {
1268  __m256i half = _mm256_set1_epi8(0x7f);
1269  half = _mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half));
1270  __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1271  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1272 }
1273 inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
1274 {
1275  v_uint32x8 l, h;
1276  v_expand(v_add_wrap(a - b, b - a), l, h);
1277  return v_reduce_sum(l + h);
1278 }
1279 inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
1280 {
1281  v_uint32x8 l, h;
1282  v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
1283  return v_reduce_sum(l + h);
1284 }
1285 inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
1286 {
1287  return v_reduce_sum(v_max(a, b) - v_min(a, b));
1288 }
1289 inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
1290 {
1291  v_int32x8 m = a < b;
1292  return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
1293 }
1294 inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
1295 {
1296  return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
1297 }
1298 
1300 inline v_uint8x32 v_popcount(const v_uint8x32& a)
1301 {
1302  __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1303  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
1304  __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
1305  return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256( a.val , _popcnt_mask)),
1306  _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
1307 }
1308 inline v_uint16x16 v_popcount(const v_uint16x16& a)
1309 {
1310  v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
1311  p += v_rotate_right<1>(p);
1312  return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
1313 }
1314 inline v_uint32x8 v_popcount(const v_uint32x8& a)
1315 {
1316  v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
1317  p += v_rotate_right<1>(p);
1318  p += v_rotate_right<2>(p);
1319  return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
1320 }
1321 inline v_uint64x4 v_popcount(const v_uint64x4& a)
1322 {
1323  return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
1324 }
1325 inline v_uint8x32 v_popcount(const v_int8x32& a)
1326 { return v_popcount(v_reinterpret_as_u8(a)); }
1327 inline v_uint16x16 v_popcount(const v_int16x16& a)
1328 { return v_popcount(v_reinterpret_as_u16(a)); }
1329 inline v_uint32x8 v_popcount(const v_int32x8& a)
1330 { return v_popcount(v_reinterpret_as_u32(a)); }
1331 inline v_uint64x4 v_popcount(const v_int64x4& a)
1332 { return v_popcount(v_reinterpret_as_u64(a)); }
1333 
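v_popcount uses the classic nibble lookup: _mm256_shuffle_epi8 indexes a 16-entry table with the low and the high 4 bits of every byte and adds the two counts; wider lanes are then folded with v_rotate_right or a zero SAD. For a single byte such as 0xB7, the low nibble 0x7 contributes 3 bits and the high nibble 0xB another 3, giving 6. Sketch (illustrative only):

    v_uint8x32 v = v256_setall_u8(0xB7);
    v_uint8x32 p = v_popcount(v);                        // every lane == 6
    v_uint64x4 q = v_popcount(v256_setall_u64(0xFFFF));  // every lane == 16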
1335 inline int v_signmask(const v_int8x32& a)
1336 { return _mm256_movemask_epi8(a.val); }
1337 inline int v_signmask(const v_uint8x32& a)
1338 { return v_signmask(v_reinterpret_as_s8(a)); }
1339 
1340 inline int v_signmask(const v_int16x16& a)
1341 { return v_signmask(v_pack(a, a)) & 0xFFFF; }
1342 inline int v_signmask(const v_uint16x16& a)
1343 { return v_signmask(v_reinterpret_as_s16(a)); }
1344 
1345 inline int v_signmask(const v_float32x8& a)
1346 { return _mm256_movemask_ps(a.val); }
1347 inline int v_signmask(const v_float64x4& a)
1348 { return _mm256_movemask_pd(a.val); }
1349 
1350 inline int v_signmask(const v_int32x8& a)
1351 { return v_signmask(v_reinterpret_as_f32(a)); }
1352 inline int v_signmask(const v_uint32x8& a)
1353 { return v_signmask(v_reinterpret_as_f32(a)); }
1354 
1355 inline int v_signmask(const v_int64x4& a)
1356 { return v_signmask(v_reinterpret_as_f64(a)); }
1357 inline int v_signmask(const v_uint64x4& a)
1358 { return v_signmask(v_reinterpret_as_f64(a)); }
1359 
1360 inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1361 inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1362 inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1363 inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1364 inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1365 inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1366 inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1367 inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1368 inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1369 inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1370 
1372 #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask) \
1373  inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
1374  inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
1375 OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, -1)
1376 OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, -1)
1377 OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, 255)
1378 OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, 255)
1379 OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4, 15)
1380 OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4, 15)
1381 OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
1382 OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)
1383 
1384 #define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec) \
1385  inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
1386  inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
1387 OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
1388 OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
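// Illustrative usage sketch (hypothetical helper): find the index of the first byte equal
// to `val` in a 32-byte block, or -1 if absent. Relies on the comparison operators and
// v256_setall_u8 defined earlier in this file, plus v_check_any / v_scan_forward above.
inline int v256_example_index_of(const uchar* buf, uchar val)
{
    v_uint8x32 eq = (v256_load(buf) == v256_setall_u8(val)); // all-ones lanes where equal
    if (!v_check_any(eq))
        return -1;                 // no lane has its MSB set
    return v_scan_forward(eq);     // index of the first matching lane
}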
1389 
1390 
1393 #if CV_FMA3
1394 #define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
1395  inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1396  { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1397  inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1398  { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }
1399 #else
1400 #define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
1401  inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1402  { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); } \
1403  inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1404  { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); }
1405 #endif
1406 
1407 #define OPENCV_HAL_IMPL_AVX_MISC(_Tpvec, suffix) \
1408  inline _Tpvec v_sqrt(const _Tpvec& x) \
1409  { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
1410  inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1411  { return v_fma(a, a, b * b); } \
1412  inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1413  { return v_sqrt(v_fma(a, a, b*b)); }
1414 
1415 OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
1416 OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
1417 OPENCV_HAL_IMPL_AVX_MISC(v_float32x8, ps)
1418 OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd)
1419 
1420 inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1421 {
1422  return a * b + c;
1423 }
1424 
1425 inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1426 {
1427  return v_fma(a, b, c);
1428 }
1429 
1430 inline v_float32x8 v_invsqrt(const v_float32x8& x)
1431 {
1432  v_float32x8 half = x * v256_setall_f32(0.5);
1433  v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
1434  // TODO: use _mm256_fnmsub_ps here when available
1435  t *= v256_setall_f32(1.5) - ((t * t) * half);
1436  return t;
1437 }
1438 
1439 inline v_float64x4 v_invsqrt(const v_float64x4& x)
1440 {
1441  return v256_setall_f64(1.) / v_sqrt(x);
1442 }
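// Illustrative usage sketch (hypothetical helper): Horner evaluation of the quadratic
// c2*x^2 + c1*x + c0 with v_fma; when CV_FMA3 is set, each step is one fused instruction.
inline v_float32x8 v256_example_poly2(const v_float32x8& x, float c0, float c1, float c2)
{
    v_float32x8 r = v256_setall_f32(c2);
    r = v_fma(r, x, v256_setall_f32(c1)); // c2*x + c1
    r = v_fma(r, x, v256_setall_f32(c0)); // (c2*x + c1)*x + c0
    return r;
}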
1443 
1445 #define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \
1446  inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
1447  { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }
1448 
1449 OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8)
1450 OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
1451 OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
1452 
1453 inline v_float32x8 v_abs(const v_float32x8& x)
1454 { return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
1455 inline v_float64x4 v_abs(const v_float64x4& x)
1456 { return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
1457 
1459 inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
1460 { return v_add_wrap(a - b, b - a); }
1461 inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
1462 { return v_add_wrap(a - b, b - a); }
1463 inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
1464 { return v_max(a, b) - v_min(a, b); }
1465 
1466 inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
1467 {
1468  v_int8x32 d = v_sub_wrap(a, b);
1469  v_int8x32 m = a < b;
1470  return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1471 }
1472 
1473 inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
1474 { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1475 
1476 inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
1477 {
1478  v_int32x8 d = a - b;
1479  v_int32x8 m = a < b;
1480  return v_reinterpret_as_u32((d ^ m) - m);
1481 }
1482 
1483 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
1484 { return v_abs(a - b); }
1485 
1486 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
1487 { return v_abs(a - b); }
1488 
1490 inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
1491 {
1492  v_int8x32 d = a - b;
1493  v_int8x32 m = a < b;
1494  return (d ^ m) - m;
1495 }
1496 inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
1497 { return v_max(a, b) - v_min(a, b); }
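// Illustrative usage sketch (hypothetical helper): per-element |a - b| for 32 uchar values,
// without the wrap-around a one-sided saturating subtraction would give.
inline v_uint8x32 v256_example_absdiff_u8(const uchar* a, const uchar* b)
{
    return v_absdiff(v256_load(a), v256_load(b));
}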
1498 
1500 
1502 inline v_int32x8 v_round(const v_float32x8& a)
1503 { return v_int32x8(_mm256_cvtps_epi32(a.val)); }
1504 
1505 inline v_int32x8 v_round(const v_float64x4& a)
1506 { return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
1507 
1508 inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
1509 {
1510  __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val);
1511  return v_int32x8(_v256_combine(ai, bi));
1512 }
1513 
1514 inline v_int32x8 v_trunc(const v_float32x8& a)
1515 { return v_int32x8(_mm256_cvttps_epi32(a.val)); }
1516 
1517 inline v_int32x8 v_trunc(const v_float64x4& a)
1518 { return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }
1519 
1520 inline v_int32x8 v_floor(const v_float32x8& a)
1521 { return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }
1522 
1523 inline v_int32x8 v_floor(const v_float64x4& a)
1524 { return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }
1525 
1526 inline v_int32x8 v_ceil(const v_float32x8& a)
1527 { return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }
1528 
1529 inline v_int32x8 v_ceil(const v_float64x4& a)
1530 { return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
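// Illustrative usage sketch (hypothetical helper): convert 8 floats to int32 using the
// rounding conversions above. v_round follows the current SSE/AVX rounding mode
// (round-to-nearest-even by default); v_floor, v_ceil and v_trunc are explicit.
inline void v256_example_round_store(const float* src, int* dst)
{
    v_float32x8 v = v256_load(src);
    v_store(dst, v_round(v));     // nearest
    // v_store(dst, v_floor(v));  // toward -inf
    // v_store(dst, v_ceil(v));   // toward +inf
    // v_store(dst, v_trunc(v));  // toward zero
}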
1531 
1533 inline v_float32x8 v_cvt_f32(const v_int32x8& a)
1534 { return v_float32x8(_mm256_cvtepi32_ps(a.val)); }
1535 
1536 inline v_float32x8 v_cvt_f32(const v_float64x4& a)
1537 { return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }
1538 
1539 inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
1540 {
1541  __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
1542  return v_float32x8(_v256_combine(af, bf));
1543 }
1544 
1545 inline v_float64x4 v_cvt_f64(const v_int32x8& a)
1546 { return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }
1547 
1548 inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
1549 { return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }
1550 
1551 inline v_float64x4 v_cvt_f64(const v_float32x8& a)
1552 { return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }
1553 
1554 inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
1555 { return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
1556 
1557 // based on answers by Mysticial and wim: https://stackoverflow.com/q/41144668
1558 inline v_float64x4 v_cvt_f64(const v_int64x4& v)
1559 {
1560  // constants encoded as floating-point
1561  __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); // 2^52
1562  __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
1563  __m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
1564  __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
1565 
1566  // Blend the 32 least significant bits of v with magic_i_lo
1567  __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
1568  // Extract the 32 most significant bits of v
1569  __m256i v_hi = _mm256_srli_epi64(v.val, 32);
1570  // Flip the msb of v_hi and blend with 0x45300000
1571  v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);
1572  // Compute in double precision
1573  __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
1574  // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
1575  __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
1576  return v_float64x4(result);
1577 }
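// Added explanation of the trick above (not in the upstream comments): v_lo stores the low
// 32 bits of each lane as the mantissa of a double biased by 2^52, while v_hi stores the
// high 32 bits (sign bit flipped) biased by 2^84 + 2^63. Subtracting magic_d_all
// (2^84 + 2^63 + 2^52) removes all three offsets across the two terms, so the final
// addition reconstructs hi*2^32 + lo, i.e. the original signed 64-bit value, in double precision.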
1578 
1580 
1581 inline v_int8x32 v256_lut(const schar* tab, const int* idx)
1582 {
1583  return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
1584  tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
1585  tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
1586  tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
1587 }
1588 inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
1589 {
1590  return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
1591  *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
1592  *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
1593  *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
1594 }
1595 inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
1596 {
1597  return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1));
1598 }
1599 inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
1600 inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
1601 inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
1602 
1603 inline v_int16x16 v256_lut(const short* tab, const int* idx)
1604 {
1605  return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
1606  tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
1607 }
1608 inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
1609 {
1610  return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2));
1611 }
1612 inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
1613 {
1614 #if defined(__GNUC__)
1615  return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2)); // looks like the intrinsic has a wrong pointer type in its GCC definition
1616 #else
1617  return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
1618 #endif
1619 }
1620 inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
1621 inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
1622 inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
1623 
1624 inline v_int32x8 v256_lut(const int* tab, const int* idx)
1625 {
1626  return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
1627 }
1628 inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
1629 {
1630 #if defined(__GNUC__)
1631  return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
1632 #else
1633  return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
1634 #endif
1635 }
1636 inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
1637 {
1638  return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
1639 }
1640 inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
1641 inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
1642 inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
1643 
1644 inline v_int64x4 v256_lut(const int64* tab, const int* idx)
1645 {
1646 #if defined(__GNUC__)
1647  return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8));
1648 #else
1649  return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8));
1650 #endif
1651 }
1652 inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
1653 {
1654  return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
1655 }
1656 inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
1657 inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
1658 
1659 inline v_float32x8 v256_lut(const float* tab, const int* idx)
1660 {
1661  return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
1662 }
1663 inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
1664 inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }
1665 
1666 inline v_float64x4 v256_lut(const double* tab, const int* idx)
1667 {
1668  return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
1669 }
1670 inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); }
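// Illustrative usage sketch (hypothetical helper): gather 8 floats through an index table,
// i.e. result[i] = tab[idx[i]]; idx must hold 8 valid int indices.
inline v_float32x8 v256_example_gather_f32(const float* tab, const int* idx)
{
    return v256_lut(tab, idx);   // a single _mm256_i32gather_ps under the hood
}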
1671 
1672 inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
1673 {
1674  return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
1675 }
1676 
1677 inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
1678 {
1679  return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1680 }
1681 
1682 inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
1683 {
1684  return v_float32x8(_mm256_i32gather_ps(tab, idxvec.val, 4));
1685 }
1686 
1687 inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
1688 {
1689  return v_float64x4(_mm256_i32gather_pd(tab, _mm256_castsi256_si128(idxvec.val), 8));
1690 }
1691 
1692 inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
1693 {
1694  int CV_DECL_ALIGNED(32) idx[8];
1695  v_store_aligned(idx, idxvec);
1696  __m128 z = _mm_setzero_ps();
1697  __m128 xy01, xy45, xy23, xy67;
1698  xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
1699  xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
1700  xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
1701  xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
1702  __m256 xy0145 = _v256_combine(xy01, xy45);
1703  xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
1704  xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
1705  xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
1706  xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
1707  __m256 xy2367 = _v256_combine(xy23, xy67);
1708 
1709  __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
1710  __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
1711 
1712  x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
1713  y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
1714 }
1715 
1716 inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
1717 {
1718  int CV_DECL_ALIGNED(32) idx[4];
1719  v_store_low(idx, idxvec);
1720  __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
1721  __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
1722  __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
1723  __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
1724  __m256d xy02 = _v256_combine(xy0, xy2);
1725  __m256d xy13 = _v256_combine(xy1, xy3);
1726 
1727  x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
1728  y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
1729 }
1730 
1731 inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
1732 {
1733  return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
1734 }
1735 inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1736 inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
1737 {
1738  return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
1739 }
1740 inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1741 
1742 inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
1743 {
1744  return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
1745 }
1746 inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1747 inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
1748 {
1749  return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
1750 }
1751 inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1752 
1753 inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
1754 {
1755  return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
1756 }
1757 inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1758 inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1759 
1760 inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
1761 {
1762  return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))),
1763  _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1764 }
1765 inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1766 
1767 inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
1768 {
1769  return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))),
1770  _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1771 }
1772 inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1773 
1774 inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
1775 {
1776  return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1777 }
1778 inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1779 inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
1780 {
1781  return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1782 }
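// Illustrative usage sketch (hypothetical helper): drop every 4th lane, e.g. to turn two
// RGBA pixels stored as 8 int32 lanes into r0,g0,b0,r1,g1,b1 in the first six lanes
// (the trailing lanes are don't-care values).
inline v_int32x8 v256_example_rgba_to_rgb(const int* rgba8)
{
    return v_pack_triplets(v256_load(rgba8));
}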
1783 
1785 
1787 
1788 // 16 >> 32
1789 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
1790 { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
1791 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1792 { return v_dotprod(a, b) + c; }
1793 
1794 // 32 >> 64
1795 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
1796 {
1797  __m256i even = _mm256_mul_epi32(a.val, b.val);
1798  __m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
1799  return v_int64x4(_mm256_add_epi64(even, odd));
1800 }
1801 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1802 { return v_dotprod(a, b) + c; }
1803 
1804 // 8 >> 32
1805 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
1806 {
1807  __m256i even_m = _mm256_set1_epi32(0xFF00FF00);
1808  __m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m);
1809  __m256i odd_a = _mm256_srli_epi16(a.val, 8);
1810 
1811  __m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m);
1812  __m256i odd_b = _mm256_srli_epi16(b.val, 8);
1813 
1814  __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
1815  __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
1816  return v_uint32x8(_mm256_add_epi32(prod0, prod1));
1817 }
1818 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1819 { return v_dotprod_expand(a, b) + c; }
1820 
1821 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
1822 {
1823  __m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
1824  __m256i odd_a = _mm256_srai_epi16(a.val, 8);
1825 
1826  __m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
1827  __m256i odd_b = _mm256_srai_epi16(b.val, 8);
1828 
1829  __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
1830  __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
1831  return v_int32x8(_mm256_add_epi32(prod0, prod1));
1832 }
1833 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1834 { return v_dotprod_expand(a, b) + c; }
1835 
1836 // 16 >> 64
1837 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
1838 {
1839  __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
1840  __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
1841  __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
1842  __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
1843 
1844  __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
1845  __m256i p13 = _mm256_srli_epi64(mul0, 32);
1846  __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
1847  __m256i p57 = _mm256_srli_epi64(mul1, 32);
1848 
1849  __m256i p15_ = _mm256_add_epi64(p02, p13);
1850  __m256i p9d_ = _mm256_add_epi64(p46, p57);
1851 
1852  return v_uint64x4(_mm256_add_epi64(
1853  _mm256_unpacklo_epi64(p15_, p9d_),
1854  _mm256_unpackhi_epi64(p15_, p9d_)
1855  ));
1856 }
1857 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1858 { return v_dotprod_expand(a, b) + c; }
1859 
1860 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
1861 {
1862  __m256i prod = _mm256_madd_epi16(a.val, b.val);
1863  __m256i sign = _mm256_srai_epi32(prod, 31);
1864 
1865  __m256i lo = _mm256_unpacklo_epi32(prod, sign);
1866  __m256i hi = _mm256_unpackhi_epi32(prod, sign);
1867 
1868  return v_int64x4(_mm256_add_epi64(
1869  _mm256_unpacklo_epi64(lo, hi),
1870  _mm256_unpackhi_epi64(lo, hi)
1871  ));
1872 }
1873 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1874 { return v_dotprod_expand(a, b) + c; }
1875 
1876 // 32 >> 64f
1877 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
1878 { return v_cvt_f64(v_dotprod(a, b)); }
1879 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1880 { return v_dotprod_expand(a, b) + c; }
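// Illustrative usage sketch (hypothetical helper): full dot product of 16 pairs of shorts.
// v_dotprod forms pairwise sums of products in 32-bit lanes; v_reduce_sum (defined earlier
// in this file for v_int32x8) collapses them to a scalar.
inline int v256_example_dot16(const short* a, const short* b)
{
    return v_reduce_sum(v_dotprod(v256_load(a), v256_load(b)));
}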
1881 
1883 
1884 // 16 >> 32
1885 inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
1886 { return v_dotprod(a, b); }
1887 inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1888 { return v_dotprod(a, b, c); }
1889 
1890 // 32 >> 64
1891 inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
1892 { return v_dotprod(a, b); }
1893 inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1894 { return v_dotprod(a, b, c); }
1895 
1896 // 8 >> 32
1897 inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
1898 { return v_dotprod_expand(a, b); }
1899 inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1900 { return v_dotprod_expand(a, b, c); }
1901 
1902 inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
1903 { return v_dotprod_expand(a, b); }
1904 inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1905 { return v_dotprod_expand(a, b, c); }
1906 
1907 // 16 >> 64
1908 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
1909 {
1910  __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
1911  __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
1912  __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
1913  __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
1914 
1915  __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
1916  __m256i p13 = _mm256_srli_epi64(mul0, 32);
1917  __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
1918  __m256i p57 = _mm256_srli_epi64(mul1, 32);
1919 
1920  __m256i p15_ = _mm256_add_epi64(p02, p13);
1921  __m256i p9d_ = _mm256_add_epi64(p46, p57);
1922 
1923  return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
1924 }
1925 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1926 { return v_dotprod_expand_fast(a, b) + c; }
1927 
1928 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
1929 {
1930  __m256i prod = _mm256_madd_epi16(a.val, b.val);
1931  __m256i sign = _mm256_srai_epi32(prod, 31);
1932  __m256i lo = _mm256_unpacklo_epi32(prod, sign);
1933  __m256i hi = _mm256_unpackhi_epi32(prod, sign);
1934  return v_int64x4(_mm256_add_epi64(lo, hi));
1935 }
1936 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1937 { return v_dotprod_expand_fast(a, b) + c; }
1938 
1939 // 32 >> 64f
1940 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
1941 { return v_dotprod_expand(a, b); }
1942 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1943 { return v_dotprod_expand(a, b, c); }
1944 
1945 #define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
1946  v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1947 
1948 inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
1949  const v_float32x8& m1, const v_float32x8& m2,
1950  const v_float32x8& m3)
1951 {
1952  v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1953  v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1954  v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1955  v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
1956  return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1957 }
1958 
1959 inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
1960  const v_float32x8& m1, const v_float32x8& m2,
1961  const v_float32x8& a)
1962 {
1963  v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1964  v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1965  v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1966  return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1967 }
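// Illustrative usage sketch (hypothetical helper): transform two 4-component vectors at
// once. v packs (x0,y0,z0,w0, x1,y1,z1,w1); m0..m3 hold the four matrix columns duplicated
// in both 128-bit halves, so each half independently computes x*m0 + y*m1 + z*m2 + w*m3.
inline v_float32x8 v256_example_transform2(const v_float32x8& v,
                                           const v_float32x8& m0, const v_float32x8& m1,
                                           const v_float32x8& m2, const v_float32x8& m3)
{
    return v_matmul(v, m0, m1, m2, m3);
}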
1968 
1969 #define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1970  inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1971  const _Tpvec& a2, const _Tpvec& a3, \
1972  _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1973  { \
1974  __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \
1975  __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \
1976  __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \
1977  __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \
1978  b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \
1979  b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \
1980  b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \
1981  b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \
1982  }
1983 
1984 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1985 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1986 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
1987 
1988 
1990 /* Expand */
1991 #define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1992  inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1993  { \
1994  b0.val = intrin(_v256_extract_low(a.val)); \
1995  b1.val = intrin(_v256_extract_high(a.val)); \
1996  } \
1997  inline _Tpwvec v_expand_low(const _Tpvec& a) \
1998  { return _Tpwvec(intrin(_v256_extract_low(a.val))); } \
1999  inline _Tpwvec v_expand_high(const _Tpvec& a) \
2000  { return _Tpwvec(intrin(_v256_extract_high(a.val))); } \
2001  inline _Tpwvec v256_load_expand(const _Tp* ptr) \
2002  { \
2003  __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
2004  return _Tpwvec(intrin(a)); \
2005  }
2006 
2007 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16)
2008 OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16)
2009 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32)
2010 OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32)
2011 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64)
2012 OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64)
2013 
2014 #define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \
2015  inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
2016  { \
2017  __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
2018  return _Tpvec(intrin(a)); \
2019  }
2020 
2021 OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
2022 OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32)
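// Illustrative usage sketch (hypothetical helper): widen 32 uchar lanes to two 16-bit
// registers, e.g. before accumulations that would overflow 8 bits.
inline void v256_example_widen_u8(const uchar* src, v_uint16x16& lo, v_uint16x16& hi)
{
    v_expand(v256_load(src), lo, hi);
}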
2023 
2024 /* pack */
2025 // 16
2026 inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
2027 { return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
2028 
2029 inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
2030 {
2031  __m256i t = _mm256_set1_epi16(255);
2032  __m256i a1 = _mm256_min_epu16(a.val, t);
2033  __m256i b1 = _mm256_min_epu16(b.val, t);
2034  return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1)));
2035 }
2036 
2037 inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
2038 {
2039  return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
2040 }
2041 
2042 inline void v_pack_store(schar* ptr, const v_int16x16& a)
2043 { v_store_low(ptr, v_pack(a, a)); }
2044 
2045 inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
2046 {
2047  const __m256i m = _mm256_set1_epi16(255);
2048  __m256i am = _mm256_min_epu16(a.val, m);
2049  am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
2050  v_store_low(ptr, v_uint8x32(am));
2051 }
2052 
2053 inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
2054 { v_store_low(ptr, v_pack_u(a, a)); }
2055 
2056 template<int n> inline
2057 v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
2058 {
2059  // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
2060  v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
2061  return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
2062  v_reinterpret_as_s16((b + delta) >> n));
2063 }
2064 
2065 template<int n> inline
2066 void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
2067 {
2068  v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
2069  v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
2070 }
2071 
2072 template<int n> inline
2073 v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
2074 {
2075  v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2076  return v_pack_u((a + delta) >> n, (b + delta) >> n);
2077 }
2078 
2079 template<int n> inline
2080 void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
2081 {
2082  v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2083  v_pack_u_store(ptr, (a + delta) >> n);
2084 }
2085 
2086 template<int n> inline
2087 v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
2088 {
2089  v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2090  return v_pack((a + delta) >> n, (b + delta) >> n);
2091 }
2092 
2093 template<int n> inline
2094 void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
2095 {
2096  v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2097  v_pack_store(ptr, (a + delta) >> n);
2098 }
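// Illustrative usage sketch (hypothetical helper): fixed-point downscale. Each 16-bit lane
// holds a value with 4 fractional bits; v_rshr_pack<4> adds the rounding constant 1<<3,
// shifts right by 4 and packs both inputs into one saturated uchar register.
inline v_uint8x32 v256_example_downscale_q4(const v_uint16x16& a, const v_uint16x16& b)
{
    return v_rshr_pack<4>(a, b);
}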
2099 
2100 // 32
2101 inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
2102 { return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }
2103 
2104 inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
2105 { return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
2106 
2107 inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
2108 { return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
2109 
2110 inline void v_pack_store(short* ptr, const v_int32x8& a)
2111 { v_store_low(ptr, v_pack(a, a)); }
2112 
2113 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
2114 {
2115  const __m256i m = _mm256_set1_epi32(65535);
2116  __m256i am = _mm256_min_epu32(a.val, m);
2117  am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
2118  v_store_low(ptr, v_uint16x16(am));
2119 }
2120 
2121 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
2122 { v_store_low(ptr, v_pack_u(a, a)); }
2123 
2124 
2125 template<int n> inline
2126 v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
2127 {
2128  // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
2129  v_uint32x8 delta = v256_setall_u32(1 << (n-1));
2130  return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
2131  v_reinterpret_as_s32((b + delta) >> n));
2132 }
2133 
2134 template<int n> inline
2135 void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
2136 {
2137  v_uint32x8 delta = v256_setall_u32(1 << (n-1));
2138  v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
2139 }
2140 
2141 template<int n> inline
2142 v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
2143 {
2144  v_int32x8 delta = v256_setall_s32(1 << (n-1));
2145  return v_pack_u((a + delta) >> n, (b + delta) >> n);
2146 }
2147 
2148 template<int n> inline
2149 void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
2150 {
2151  v_int32x8 delta = v256_setall_s32(1 << (n-1));
2152  v_pack_u_store(ptr, (a + delta) >> n);
2153 }
2154 
2155 template<int n> inline
2156 v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
2157 {
2158  v_int32x8 delta = v256_setall_s32(1 << (n-1));
2159  return v_pack((a + delta) >> n, (b + delta) >> n);
2160 }
2161 
2162 template<int n> inline
2163 void v_rshr_pack_store(short* ptr, const v_int32x8& a)
2164 {
2165  v_int32x8 delta = v256_setall_s32(1 << (n-1));
2166  v_pack_store(ptr, (a + delta) >> n);
2167 }
2168 
2169 // 64
2170 // Non-saturating pack
2171 inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
2172 {
2173  __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
2174  __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
2175  __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3
2176  return v_uint32x8(_v256_shuffle_odd_64(ab));
2177 }
2178 
2179 inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
2180 { return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2181 
2182 inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
2183 {
2184  __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
2185  v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
2186 }
2187 
2188 inline void v_pack_store(int* ptr, const v_int64x4& b)
2189 { v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2190 
2191 template<int n> inline
2192 v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
2193 {
2194  v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
2195  return v_pack((a + delta) >> n, (b + delta) >> n);
2196 }
2197 
2198 template<int n> inline
2199 void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
2200 {
2201  v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
2202  v_pack_store(ptr, (a + delta) >> n);
2203 }
2204 
2205 template<int n> inline
2206 v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
2207 {
2208  v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
2209  return v_pack((a + delta) >> n, (b + delta) >> n);
2210 }
2211 
2212 template<int n> inline
2213 void v_rshr_pack_store(int* ptr, const v_int64x4& a)
2214 {
2215  v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
2216  v_pack_store(ptr, (a + delta) >> n);
2217 }
2218 
2219 // pack boolean
2220 inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
2221 {
2222  __m256i ab = _mm256_packs_epi16(a.val, b.val);
2223  return v_uint8x32(_v256_shuffle_odd_64(ab));
2224 }
2225 
2226 inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
2227  const v_uint32x8& c, const v_uint32x8& d)
2228 {
2229  __m256i ab = _mm256_packs_epi32(a.val, b.val);
2230  __m256i cd = _mm256_packs_epi32(c.val, d.val);
2231 
2232  __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd));
2233  return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)));
2234 }
2235 
2236 inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2237  const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
2238  const v_uint64x4& g, const v_uint64x4& h)
2239 {
2240  __m256i ab = _mm256_packs_epi32(a.val, b.val);
2241  __m256i cd = _mm256_packs_epi32(c.val, d.val);
2242  __m256i ef = _mm256_packs_epi32(e.val, f.val);
2243  __m256i gh = _mm256_packs_epi32(g.val, h.val);
2244 
2245  __m256i abcd = _mm256_packs_epi32(ab, cd);
2246  __m256i efgh = _mm256_packs_epi32(ef, gh);
2247  __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh));
2248 
2249  __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
2250  return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
2251 }
2252 
2253 /* Recombine */
2254 // it's up there with the load and store operations
2255 
2256 /* Extract */
2257 #define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) \
2258  template<int s> \
2259  inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2260  { return v_rotate_right<s>(a, b); }
2261 
2262 OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
2263 OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
2264 OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
2265 OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
2266 OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
2267 OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
2268 OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
2269 OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
2270 OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
2271 OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
2272 
2273 template<int i>
2274 inline uchar v_extract_n(v_uint8x32 a)
2275 {
2276  return (uchar)_v256_extract_epi8<i>(a.val);
2277 }
2278 
2279 template<int i>
2280 inline schar v_extract_n(v_int8x32 a)
2281 {
2282  return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
2283 }
2284 
2285 template<int i>
2286 inline ushort v_extract_n(v_uint16x16 a)
2287 {
2288  return (ushort)_v256_extract_epi16<i>(a.val);
2289 }
2290 
2291 template<int i>
2292 inline short v_extract_n(v_int16x16 a)
2293 {
2294  return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
2295 }
2296 
2297 template<int i>
2298 inline uint v_extract_n(v_uint32x8 a)
2299 {
2300  return (uint)_v256_extract_epi32<i>(a.val);
2301 }
2302 
2303 template<int i>
2304 inline int v_extract_n(v_int32x8 a)
2305 {
2306  return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
2307 }
2308 
2309 template<int i>
2310 inline uint64 v_extract_n(v_uint64x4 a)
2311 {
2312  return (uint64)_v256_extract_epi64<i>(a.val);
2313 }
2314 
2315 template<int i>
2316 inline int64 v_extract_n(v_int64x4 v)
2317 {
2318  return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
2319 }
2320 
2321 template<int i>
2322 inline float v_extract_n(v_float32x8 v)
2323 {
2324  union { uint iv; float fv; } d;
2325  d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
2326  return d.fv;
2327 }
2328 
2329 template<int i>
2330 inline double v_extract_n(v_float64x4 v)
2331 {
2332  union { uint64 iv; double dv; } d;
2333  d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
2334  return d.dv;
2335 }
2336 
2337 template<int i>
2338 inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
2339 {
2340  static const __m256i perm = _mm256_set1_epi32((char)i);
2341  return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
2342 }
2343 
2344 template<int i>
2345 inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
2346 { return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2347 
2348 template<int i>
2349 inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
2350 { return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
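// Illustrative usage sketch (hypothetical helper): compile-time lane access. Both the
// extracted lane index and the broadcast lane index must be template constants.
inline float v256_example_lane_ops(const v_float32x8& v)
{
    v_float32x8 first = v_broadcast_element<0>(v);  // every lane = v[0]
    return v_extract_n<3>(v) + v_extract_n<7>(first);
}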
2351 
2352 
2354 
2355 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
2356 {
2357  __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2358  __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2359 
2360  const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
2361  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
2362  __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
2363  __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
2364  __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2365  __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2366  __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2367  __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2368  a = v_uint8x32(a0);
2369  b = v_uint8x32(b0);
2370 }
2371 
2372 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
2373 {
2374  __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2375  __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2376 
2377  const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2378  0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
2379  __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
2380  __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
2381  __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2382  __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2383  __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2384  __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2385  a = v_uint16x16(a0);
2386  b = v_uint16x16(b0);
2387 }
2388 
2389 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
2390 {
2391  __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2392  __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2393 
2394  enum { sh = 0+2*4+1*16+3*64 };
2395  __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
2396  __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
2397  __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2398  __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2399  __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2400  __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2401  a = v_uint32x8(a0);
2402  b = v_uint32x8(b0);
2403 }
2404 
2405 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
2406 {
2407  __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2408  __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2409 
2410  __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
2411  __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
2412  __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2413  __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2414  a = v_uint64x4(a0);
2415  b = v_uint64x4(b0);
2416 }
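// Illustrative usage sketch (hypothetical helper): split 64 bytes of 2-channel data
// (e.g. a CV_8UC2 row) into its planes: a receives elements 0,2,4,..., b receives 1,3,5,...
inline void v256_example_split2(const uchar* interleaved, v_uint8x32& a, v_uint8x32& b)
{
    v_load_deinterleave(interleaved, a, b);
}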
2417 
2418 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
2419 {
2420  __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2421  __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2422  __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
2423 
2424  __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2425  __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2426 
2427  const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2428  0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2429  const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2430  -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
2431 
2432  __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2433  __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2434  __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2435 
2436  const __m256i
2437  sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
2438  0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
2439  sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
2440  1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
2441  sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2442  2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2443  b0 = _mm256_shuffle_epi8(b0, sh_b);
2444  g0 = _mm256_shuffle_epi8(g0, sh_g);
2445  r0 = _mm256_shuffle_epi8(r0, sh_r);
2446 
2447  a = v_uint8x32(b0);
2448  b = v_uint8x32(g0);
2449  c = v_uint8x32(r0);
2450 }
2451 
2452 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
2453 {
2454  __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2455  __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2456  __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2457 
2458  __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2459  __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2460 
2461  const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2462  0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2463  const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2464  -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2465  __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2466  __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2467  __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2468  const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2469  0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2470  const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2471  2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2472  const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2473  4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2474  b0 = _mm256_shuffle_epi8(b0, sh_b);
2475  g0 = _mm256_shuffle_epi8(g0, sh_g);
2476  r0 = _mm256_shuffle_epi8(r0, sh_r);
2477 
2478  a = v_uint16x16(b0);
2479  b = v_uint16x16(g0);
2480  c = v_uint16x16(r0);
2481 }
2482 
2483 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
2484 {
2485  __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2486  __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2487  __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2488 
2489  __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2490  __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2491 
2492  __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
2493  __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
2494  __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
2495 
2496  b0 = _mm256_shuffle_epi32(b0, 0x6c);
2497  g0 = _mm256_shuffle_epi32(g0, 0xb1);
2498  r0 = _mm256_shuffle_epi32(r0, 0xc6);
2499 
2500  a = v_uint32x8(b0);
2501  b = v_uint32x8(g0);
2502  c = v_uint32x8(r0);
2503 }
2504 
2505 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
2506 {
2507  __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2508  __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2509  __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2510 
2511  __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
2512  __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
2513  __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
2514  __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
2515  __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
2516  __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
2517 
2518  a = v_uint64x4(b0);
2519  b = v_uint64x4(g0);
2520  c = v_uint64x4(r0);
2521 }
2522 
2523 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
2524 {
2525  __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2526  __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2527  __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
2528  __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
2529  const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
2530  0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
2531 
2532  __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2533  __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2534  __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2535  __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2536 
2537  __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2538  __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2539  __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2540  __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2541 
2542  __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2543  __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2544  __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2545  __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2546 
2547  __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2548  __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2549  __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2550  __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2551 
2552  a = v_uint8x32(b0);
2553  b = v_uint8x32(g0);
2554  c = v_uint8x32(r0);
2555  d = v_uint8x32(a0);
2556 }
2557 
2558 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
2559 {
2560  __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2561  __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2562  __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2563  __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
2564  const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
2565  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
2566  __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2567  __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2568  __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2569  __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2570 
2571  __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2572  __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2573  __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2574  __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2575 
2576  __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2577  __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2578  __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2579  __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2580 
2581  __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2582  __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2583  __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2584  __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2585 
2586  a = v_uint16x16(b0);
2587  b = v_uint16x16(g0);
2588  c = v_uint16x16(r0);
2589  d = v_uint16x16(a0);
2590 }
2591 
2592 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
2593 {
2594  __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
2595  __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2596  __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2597  __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
2598 
2599  __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2600  __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2601  __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2602  __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2603 
2604  __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2605  __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2606  __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2607  __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2608 
2609  __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2610  __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2611  __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2612  __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2613 
2614  a = v_uint32x8(b0);
2615  b = v_uint32x8(g0);
2616  c = v_uint32x8(r0);
2617  d = v_uint32x8(a0);
2618 }
2619 
2620 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
2621 {
2622  __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
2623  __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2624  __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2625  __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
2626 
2627  __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
2628  __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
2629  __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
2630  __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
2631 
2632  __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
2633  __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
2634  __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
2635  __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
2636 
2637  a = v_uint64x4(b0);
2638  b = v_uint64x4(g0);
2639  c = v_uint64x4(r0);
2640  d = v_uint64x4(a0);
2641 }
2642 
2644 
2645 inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
2646  hal::StoreMode mode=hal::STORE_UNALIGNED )
2647 {
2648  __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
2649  __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
2650 
2651  __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2652  __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2653 
2654  if( mode == hal::STORE_ALIGNED_NOCACHE )
2655  {
2656  _mm256_stream_si256((__m256i*)ptr, xy0);
2657  _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
2658  }
2659  else if( mode == hal::STORE_ALIGNED )
2660  {
2661  _mm256_store_si256((__m256i*)ptr, xy0);
2662  _mm256_store_si256((__m256i*)(ptr + 32), xy1);
2663  }
2664  else
2665  {
2666  _mm256_storeu_si256((__m256i*)ptr, xy0);
2667  _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
2668  }
2669 }
2670 
2671 inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
2672  hal::StoreMode mode=hal::STORE_UNALIGNED )
2673 {
2674  __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
2675  __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
2676 
2677  __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2678  __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2679 
2680  if( mode == hal::STORE_ALIGNED_NOCACHE )
2681  {
2682  _mm256_stream_si256((__m256i*)ptr, xy0);
2683  _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
2684  }
2685  else if( mode == hal::STORE_ALIGNED )
2686  {
2687  _mm256_store_si256((__m256i*)ptr, xy0);
2688  _mm256_store_si256((__m256i*)(ptr + 16), xy1);
2689  }
2690  else
2691  {
2692  _mm256_storeu_si256((__m256i*)ptr, xy0);
2693  _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
2694  }
2695 }
2696 
2697 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
2698  hal::StoreMode mode=hal::STORE_UNALIGNED )
2699 {
2700  __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
2701  __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
2702 
2703  __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2704  __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2705 
2706  if( mode == hal::STORE_ALIGNED_NOCACHE )
2707  {
2708  _mm256_stream_si256((__m256i*)ptr, xy0);
2709  _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
2710  }
2711  else if( mode == hal::STORE_ALIGNED )
2712  {
2713  _mm256_store_si256((__m256i*)ptr, xy0);
2714  _mm256_store_si256((__m256i*)(ptr + 8), xy1);
2715  }
2716  else
2717  {
2718  _mm256_storeu_si256((__m256i*)ptr, xy0);
2719  _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
2720  }
2721 }
2722 
2723 inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
2724  hal::StoreMode mode=hal::STORE_UNALIGNED )
2725 {
2726  __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
2727  __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
2728 
2729  __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2730  __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2731 
2732  if( mode == hal::STORE_ALIGNED_NOCACHE )
2733  {
2734  _mm256_stream_si256((__m256i*)ptr, xy0);
2735  _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
2736  }
2737  else if( mode == hal::STORE_ALIGNED )
2738  {
2739  _mm256_store_si256((__m256i*)ptr, xy0);
2740  _mm256_store_si256((__m256i*)(ptr + 4), xy1);
2741  }
2742  else
2743  {
2744  _mm256_storeu_si256((__m256i*)ptr, xy0);
2745  _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
2746  }
2747 }
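
Each two-channel v_store_interleave above zips x and y lane by lane and then writes two 256-bit blocks, honoring the requested hal::StoreMode. A minimal sketch of the call pattern (illustrative only; the function name and buffer assumptions are mine, and the aligned and streaming variants require a 32-byte aligned destination):

#include <opencv2/core/hal/intrin.hpp>  // assumed public entry point

// Interleave two 16-lane u16 planes into x0,y0,x1,y1,... (32 values written).
void zip_planes_u16(const unsigned short* plane0, const unsigned short* plane1,
                    unsigned short* packed)
{
    cv::v_uint16x16 x = cv::v256_load(plane0);
    cv::v_uint16x16 y = cv::v256_load(plane1);
    cv::v_store_interleave(packed, x, y);  // unaligned store (default mode)
    // With a 32-byte aligned `packed`, the other modes could be used instead:
    // cv::v_store_interleave(packed, x, y, cv::hal::STORE_ALIGNED);
    // cv::v_store_interleave(packed, x, y, cv::hal::STORE_ALIGNED_NOCACHE);
}
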
2748 
2749 inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
2750  hal::StoreMode mode=hal::STORE_UNALIGNED )
2751 {
2752  const __m256i sh_b = _mm256_setr_epi8(
2753  0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
2754  0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2755  const __m256i sh_g = _mm256_setr_epi8(
2756  5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
2757  5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2758  const __m256i sh_r = _mm256_setr_epi8(
2759  10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
2760  10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2761 
2762  __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2763  __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2764  __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2765 
2766  const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2767  0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2768  const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2769  0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2770 
2771  __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2772  __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2773  __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2774 
2775  __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2776  __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
2777  __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
2778 
2779  if( mode == hal::STORE_ALIGNED_NOCACHE )
2780  {
2781  _mm256_stream_si256((__m256i*)ptr, bgr0);
2782  _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
2783  _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
2784  }
2785  else if( mode == hal::STORE_ALIGNED )
2786  {
2787  _mm256_store_si256((__m256i*)ptr, bgr0);
2788  _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
2789  _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
2790  }
2791  else
2792  {
2793  _mm256_storeu_si256((__m256i*)ptr, bgr0);
2794  _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
2795  _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
2796  }
2797 }
2798 
2799 inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
2800  hal::StoreMode mode=hal::STORE_UNALIGNED )
2801 {
2802  const __m256i sh_b = _mm256_setr_epi8(
2803  0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2804  0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2805  const __m256i sh_g = _mm256_setr_epi8(
2806  10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2807  10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2808  const __m256i sh_r = _mm256_setr_epi8(
2809  4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2810  4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2811 
2812  __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2813  __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2814  __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2815 
2816  const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2817  0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2818  const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2819  -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2820 
2821  __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2822  __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2823  __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2824 
2825  __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
2826  //__m256i bgr1 = p1;
2827  __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
2828 
2829  if( mode == hal::STORE_ALIGNED_NOCACHE )
2830  {
2831  _mm256_stream_si256((__m256i*)ptr, bgr0);
2832  _mm256_stream_si256((__m256i*)(ptr + 16), p1);
2833  _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
2834  }
2835  else if( mode == hal::STORE_ALIGNED )
2836  {
2837  _mm256_store_si256((__m256i*)ptr, bgr0);
2838  _mm256_store_si256((__m256i*)(ptr + 16), p1);
2839  _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
2840  }
2841  else
2842  {
2843  _mm256_storeu_si256((__m256i*)ptr, bgr0);
2844  _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
2845  _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
2846  }
2847 }
2848 
2849 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
2850  hal::StoreMode mode=hal::STORE_UNALIGNED )
2851 {
2852  __m256i b0 = _mm256_shuffle_epi32(a.val, 0x6c);
2853  __m256i g0 = _mm256_shuffle_epi32(b.val, 0xb1);
2854  __m256i r0 = _mm256_shuffle_epi32(c.val, 0xc6);
2855 
2856  __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
2857  __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
2858  __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
2859 
2860  __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2861  //__m256i bgr1 = p2;
2862  __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2863 
2864  if( mode == hal::STORE_ALIGNED_NOCACHE )
2865  {
2866  _mm256_stream_si256((__m256i*)ptr, bgr0);
2867  _mm256_stream_si256((__m256i*)(ptr + 8), p2);
2868  _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
2869  }
2870  else if( mode == hal::STORE_ALIGNED )
2871  {
2872  _mm256_store_si256((__m256i*)ptr, bgr0);
2873  _mm256_store_si256((__m256i*)(ptr + 8), p2);
2874  _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
2875  }
2876  else
2877  {
2878  _mm256_storeu_si256((__m256i*)ptr, bgr0);
2879  _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
2880  _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
2881  }
2882 }
2883 
2884 inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2885  hal::StoreMode mode=hal::STORE_UNALIGNED )
2886 {
2887  __m256i s01 = _mm256_unpacklo_epi64(a.val, b.val);
2888  __m256i s12 = _mm256_unpackhi_epi64(b.val, c.val);
2889  __m256i s20 = _mm256_blend_epi32(c.val, a.val, 0xcc);
2890 
2891  __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
2892  __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
2893  __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
2894 
2895  if( mode == hal::STORE_ALIGNED_NOCACHE )
2896  {
2897  _mm256_stream_si256((__m256i*)ptr, bgr0);
2898  _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
2899  _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
2900  }
2901  else if( mode == hal::STORE_ALIGNED )
2902  {
2903  _mm256_store_si256((__m256i*)ptr, bgr0);
2904  _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
2905  _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
2906  }
2907  else
2908  {
2909  _mm256_storeu_si256((__m256i*)ptr, bgr0);
2910  _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
2911  _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
2912  }
2913 }
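
The three-channel stores rotate each input with a per-lane shuffle and blend the results so that consecutive B,G,R triplets can be written as three straight 256-bit stores, avoiding any scatter. A usage sketch (function and buffer names are assumptions):

#include <opencv2/core/hal/intrin.hpp>  // assumed public entry point

// Pack three 32-byte channel planes into 96 bytes of B0,G0,R0,B1,G1,R1,...
void pack_bgr_u8(const unsigned char* b, const unsigned char* g,
                 const unsigned char* r, unsigned char* bgr)
{
    cv::v_uint8x32 vb = cv::v256_load(b);
    cv::v_uint8x32 vg = cv::v256_load(g);
    cv::v_uint8x32 vr = cv::v256_load(r);
    cv::v_store_interleave(bgr, vb, vg, vr);  // 32 pixels, 96 bytes written
}
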
2914 
2915 inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
2916  const v_uint8x32& c, const v_uint8x32& d,
2917  hal::StoreMode mode=hal::STORE_UNALIGNED )
2918 {
2919  __m256i bg0 = _mm256_unpacklo_epi8(a.val, b.val);
2920  __m256i bg1 = _mm256_unpackhi_epi8(a.val, b.val);
2921  __m256i ra0 = _mm256_unpacklo_epi8(c.val, d.val);
2922  __m256i ra1 = _mm256_unpackhi_epi8(c.val, d.val);
2923 
2924  __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
2925  __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
2926  __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
2927  __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
2928 
2929  __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2930  __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2931  __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2932  __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2933 
2934  if( mode == hal::STORE_ALIGNED_NOCACHE )
2935  {
2936  _mm256_stream_si256((__m256i*)ptr, bgra0);
2937  _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
2938  _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
2939  _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
2940  }
2941  else if( mode == hal::STORE_ALIGNED )
2942  {
2943  _mm256_store_si256((__m256i*)ptr, bgra0);
2944  _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
2945  _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
2946  _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
2947  }
2948  else
2949  {
2950  _mm256_storeu_si256((__m256i*)ptr, bgra0);
2951  _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
2952  _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
2953  _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
2954  }
2955 }
2956 
2957 inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
2958  const v_uint16x16& c, const v_uint16x16& d,
2959  hal::StoreMode mode=hal::STORE_UNALIGNED )
2960 {
2961  __m256i bg0 = _mm256_unpacklo_epi16(a.val, b.val);
2962  __m256i bg1 = _mm256_unpackhi_epi16(a.val, b.val);
2963  __m256i ra0 = _mm256_unpacklo_epi16(c.val, d.val);
2964  __m256i ra1 = _mm256_unpackhi_epi16(c.val, d.val);
2965 
2966  __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
2967  __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
2968  __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
2969  __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
2970 
2971  __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2972  __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2973  __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2974  __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2975 
2976  if( mode == hal::STORE_ALIGNED_NOCACHE )
2977  {
2978  _mm256_stream_si256((__m256i*)ptr, bgra0);
2979  _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
2980  _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
2981  _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
2982  }
2983  else if( mode == hal::STORE_ALIGNED )
2984  {
2985  _mm256_store_si256((__m256i*)ptr, bgra0);
2986  _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
2987  _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
2988  _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
2989  }
2990  else
2991  {
2992  _mm256_storeu_si256((__m256i*)ptr, bgra0);
2993  _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
2994  _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
2995  _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
2996  }
2997 }
2998 
2999 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
3000  const v_uint32x8& c, const v_uint32x8& d,
3001  hal::StoreMode mode=hal::STORE_UNALIGNED )
3002 {
3003  __m256i bg0 = _mm256_unpacklo_epi32(a.val, b.val);
3004  __m256i bg1 = _mm256_unpackhi_epi32(a.val, b.val);
3005  __m256i ra0 = _mm256_unpacklo_epi32(c.val, d.val);
3006  __m256i ra1 = _mm256_unpackhi_epi32(c.val, d.val);
3007 
3008  __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
3009  __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
3010  __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
3011  __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
3012 
3013  __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
3014  __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
3015  __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
3016  __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
3017 
3018  if( mode == hal::STORE_ALIGNED_NOCACHE )
3019  {
3020  _mm256_stream_si256((__m256i*)ptr, bgra0);
3021  _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
3022  _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
3023  _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
3024  }
3025  else if( mode == hal::STORE_ALIGNED )
3026  {
3027  _mm256_store_si256((__m256i*)ptr, bgra0);
3028  _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
3029  _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
3030  _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
3031  }
3032  else
3033  {
3034  _mm256_storeu_si256((__m256i*)ptr, bgra0);
3035  _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
3036  _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
3037  _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
3038  }
3039 }
3040 
3041 inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
3042  const v_uint64x4& c, const v_uint64x4& d,
3043  hal::StoreMode mode=hal::STORE_UNALIGNED )
3044 {
3045  __m256i bg0 = _mm256_unpacklo_epi64(a.val, b.val);
3046  __m256i bg1 = _mm256_unpackhi_epi64(a.val, b.val);
3047  __m256i ra0 = _mm256_unpacklo_epi64(c.val, d.val);
3048  __m256i ra1 = _mm256_unpackhi_epi64(c.val, d.val);
3049 
3050  __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
3051  __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
3052  __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
3053  __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
3054 
3055  if( mode == hal::STORE_ALIGNED_NOCACHE )
3056  {
3057  _mm256_stream_si256((__m256i*)ptr, bgra0);
3058  _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
3059  _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
3060  _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
3061  }
3062  else if( mode == hal::STORE_ALIGNED )
3063  {
3064  _mm256_store_si256((__m256i*)ptr, bgra0);
3065  _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
3066  _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
3067  _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
3068  }
3069  else
3070  {
3071  _mm256_storeu_si256((__m256i*)ptr, bgra0);
3072  _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
3073  _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
3074  _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
3075  }
3076 }
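
Paired with the matching v_load_deinterleave overloads, the four-channel stores make simple per-pixel reorderings cheap, because the channels only change places between registers. A sketch of a channel swap (e.g. BGRA to RGBA); the function name and the assumption that the buffers hold 32 pixels are mine:

#include <opencv2/core/hal/intrin.hpp>  // assumed public entry point

// Swap the first and third channel of 32 packed 4-channel pixels (128 bytes).
void swap_c0_c2(const unsigned char* src, unsigned char* dst)
{
    cv::v_uint8x32 c0, c1, c2, c3;
    cv::v_load_deinterleave(src, c0, c1, c2, c3);
    cv::v_store_interleave(dst, c2, c1, c0, c3);  // write channels in swapped order
}
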
3077 
3078 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
3079 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
3080 { \
3081  _Tpvec1 a1, b1; \
3082  v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
3083  a0 = v_reinterpret_as_##suffix0(a1); \
3084  b0 = v_reinterpret_as_##suffix0(b1); \
3085 } \
3086 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
3087 { \
3088  _Tpvec1 a1, b1, c1; \
3089  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
3090  a0 = v_reinterpret_as_##suffix0(a1); \
3091  b0 = v_reinterpret_as_##suffix0(b1); \
3092  c0 = v_reinterpret_as_##suffix0(c1); \
3093 } \
3094 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
3095 { \
3096  _Tpvec1 a1, b1, c1, d1; \
3097  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
3098  a0 = v_reinterpret_as_##suffix0(a1); \
3099  b0 = v_reinterpret_as_##suffix0(b1); \
3100  c0 = v_reinterpret_as_##suffix0(c1); \
3101  d0 = v_reinterpret_as_##suffix0(d1); \
3102 } \
3103 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3104  hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3105 { \
3106  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3107  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3108  v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
3109 } \
3110 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
3111  hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3112 { \
3113  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3114  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3115  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3116  v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
3117 } \
3118 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3119  const _Tpvec0& c0, const _Tpvec0& d0, \
3120  hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3121 { \
3122  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3123  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3124  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3125  _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
3126  v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
3127 }
3128 
3129 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
3130 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
3131 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
3132 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
3133 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
3134 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
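
The OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE macro gives the signed-integer and floating-point element types the same interleave/deinterleave behavior by reinterpreting them bit-for-bit as the unsigned vectors implemented above. Written out by hand for one case, its two-channel store expansion looks roughly like the sketch below (the function name is mine and the default store mode is assumed; the generated code above is the authoritative version):

#include <opencv2/core/hal/intrin.hpp>  // assumed public entry point

// Hand-expanded illustration of the f32 -> u32 forwarding done by the macro.
inline void store_interleave_f32_sketch(float* ptr,
                                        const cv::v_float32x8& a0,
                                        const cv::v_float32x8& b0)
{
    cv::v_uint32x8 a1 = cv::v_reinterpret_as_u32(a0);  // bit-exact reinterpret
    cv::v_uint32x8 b1 = cv::v_reinterpret_as_u32(b0);
    cv::v_store_interleave((unsigned*)ptr, a1, b1);    // reuse the u32 kernel
}
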
3135 
3136 //
3137 // FP16
3138 //
3139 
3140 inline v_float32x8 v256_load_expand(const hfloat* ptr)
3141 {
3142 #if CV_FP16
3143  return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3144 #else
3145  float CV_DECL_ALIGNED(32) buf[8];
3146  for (int i = 0; i < 8; i++)
3147  buf[i] = (float)ptr[i];
3148  return v256_load_aligned(buf);
3149 #endif
3150 }
3151 
3152 inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
3153 {
3154 #if CV_FP16
3155  __m128i ah = _mm256_cvtps_ph(a.val, 0);
3156  _mm_storeu_si128((__m128i*)ptr, ah);
3157 #else
3158  float CV_DECL_ALIGNED(32) buf[8];
3159  v_store_aligned(buf, a);
3160  for (int i = 0; i < 8; i++)
3161  ptr[i] = hfloat(buf[i]);
3162 #endif
3163 }
3164 
3165 //
3166 // end of FP16
3167 //
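
v256_load_expand widens eight half-precision values to a v_float32x8 and v_pack_store narrows them back; with CV_FP16 each direction is a single F16C conversion, otherwise the scalar fallback above is used. A round-trip sketch (function and buffer names are assumptions, and cv::hfloat is assumed to be the half-precision storage type used by this OpenCV version):

#include <opencv2/core/hal/intrin.hpp>  // assumed public entry point

// Widen 8 halves to float32 and immediately pack them back (a no-op round trip).
void fp16_roundtrip(const cv::hfloat* in8, cv::hfloat* out8)
{
    cv::v_float32x8 f = cv::v256_load_expand(in8);  // 8 x fp16 -> 8 x fp32
    cv::v_pack_store(out8, f);                      // 8 x fp32 -> 8 x fp16
}
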
3168 
3169 inline void v256_cleanup() { _mm256_zeroall(); }
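
v256_cleanup issues _mm256_zeroall, clearing the ymm registers after a stretch of 256-bit code; callers typically invoke it once when a wide-intrinsic loop finishes to avoid AVX/SSE transition penalties. A sketch of that pattern (loop bounds and names are assumptions):

#include <opencv2/core/hal/intrin.hpp>  // assumed public entry point

// Process n bytes of 2-channel data (n assumed to be a multiple of 64).
void copy_two_channels(const unsigned char* src, unsigned char* dst, int n)
{
    for (int i = 0; i < n; i += 64)
    {
        cv::v_uint8x32 x, y;
        cv::v_load_deinterleave(src + i, x, y);
        cv::v_store_interleave(dst + i, x, y);
    }
    cv::v256_cleanup();  // zero ymm state once the 256-bit section is finished
}
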
3170 
3171 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3172 
3174 
3175 } // cv::
3176 
3177 #endif // OPENCV_HAL_INTRIN_AVX_HPP