EstervQrCode 2.0.0
Library for QR code manipulation
intrin_avx.hpp
1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html
4
5#ifndef OPENCV_HAL_INTRIN_AVX_HPP
6#define OPENCV_HAL_INTRIN_AVX_HPP
7
8#define CV_SIMD256 1
9#define CV_SIMD256_64F 1
10#define CV_SIMD256_FP16 0 // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1)
11
12namespace cv
13{
14
16
17CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
18
20
21inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
22{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }
23
24inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
25{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
26
27inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
28{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }
29
30inline int _v_cvtsi256_si32(const __m256i& a)
31{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }
32
33inline __m256i _v256_shuffle_odd_64(const __m256i& v)
34{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
35
36inline __m256d _v256_shuffle_odd_64(const __m256d& v)
37{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
38
39template<int imm>
40inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
41{ return _mm256_permute2x128_si256(a, b, imm); }
42
43template<int imm>
44inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
45{ return _mm256_permute2f128_ps(a, b, imm); }
46
47template<int imm>
48inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
49{ return _mm256_permute2f128_pd(a, b, imm); }
50
51template<int imm, typename _Tpvec>
52inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
53{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
54
55template<int imm>
56inline __m256i _v256_permute4x64(const __m256i& a)
57{ return _mm256_permute4x64_epi64(a, imm); }
58
59template<int imm>
60inline __m256d _v256_permute4x64(const __m256d& a)
61{ return _mm256_permute4x64_pd(a, imm); }
62
63template<int imm, typename _Tpvec>
64inline _Tpvec v256_permute4x64(const _Tpvec& a)
65{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
66
67inline __m128i _v256_extract_high(const __m256i& v)
68{ return _mm256_extracti128_si256(v, 1); }
69
70inline __m128 _v256_extract_high(const __m256& v)
71{ return _mm256_extractf128_ps(v, 1); }
72
73inline __m128d _v256_extract_high(const __m256d& v)
74{ return _mm256_extractf128_pd(v, 1); }
75
76inline __m128i _v256_extract_low(const __m256i& v)
77{ return _mm256_castsi256_si128(v); }
78
79inline __m128 _v256_extract_low(const __m256& v)
80{ return _mm256_castps256_ps128(v); }
81
82inline __m128d _v256_extract_low(const __m256d& v)
83{ return _mm256_castpd256_pd128(v); }
84
85inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
86{
87 const __m256i m = _mm256_set1_epi32(65535);
88 __m256i am = _mm256_min_epu32(a, m);
89 __m256i bm = _mm256_min_epu32(b, m);
90 return _mm256_packus_epi32(am, bm);
91}
92
93template<int i>
94inline int _v256_extract_epi8(const __m256i& a)
95{
96#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
97 return _mm256_extract_epi8(a, i);
98#else
99 __m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
100 return _mm_extract_epi8(b, i & 15); // SSE4.1
101#endif
102}
103
104template<int i>
105inline int _v256_extract_epi16(const __m256i& a)
106{
107#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
108 return _mm256_extract_epi16(a, i);
109#else
110 __m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
111 return _mm_extract_epi16(b, i & 7); // SSE2
112#endif
113}
114
115template<int i>
116inline int _v256_extract_epi32(const __m256i& a)
117{
118#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
119 return _mm256_extract_epi32(a, i);
120#else
121 __m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
122 return _mm_extract_epi32(b, i & 3); // SSE4.1
123#endif
124}
125
126template<int i>
127inline int64 _v256_extract_epi64(const __m256i& a)
128{
129#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
130 return _mm256_extract_epi64(a, i);
131#else
132 __m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
133 return _mm_extract_epi64(b, i & 1); // SSE4.1
134#endif
135}
136
138
139struct v_uint8x32
140{
141 typedef uchar lane_type;
142 enum { nlanes = 32 };
143 __m256i val;
144
145 explicit v_uint8x32(__m256i v) : val(v) {}
146 v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3,
147 uchar v4, uchar v5, uchar v6, uchar v7,
148 uchar v8, uchar v9, uchar v10, uchar v11,
149 uchar v12, uchar v13, uchar v14, uchar v15,
150 uchar v16, uchar v17, uchar v18, uchar v19,
151 uchar v20, uchar v21, uchar v22, uchar v23,
152 uchar v24, uchar v25, uchar v26, uchar v27,
153 uchar v28, uchar v29, uchar v30, uchar v31)
154 {
155 val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
156 (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9,
157 (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
158 (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
159 (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
160 (char)v28, (char)v29, (char)v30, (char)v31);
161 }
162 /* coverity[uninit_ctor]: suppress warning */
163 v_uint8x32() {}
164
165 uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
166};
167
168struct v_int8x32
169{
170 typedef schar lane_type;
171 enum { nlanes = 32 };
172 __m256i val;
173
174 explicit v_int8x32(__m256i v) : val(v) {}
175 v_int8x32(schar v0, schar v1, schar v2, schar v3,
176 schar v4, schar v5, schar v6, schar v7,
177 schar v8, schar v9, schar v10, schar v11,
178 schar v12, schar v13, schar v14, schar v15,
179 schar v16, schar v17, schar v18, schar v19,
180 schar v20, schar v21, schar v22, schar v23,
181 schar v24, schar v25, schar v26, schar v27,
182 schar v28, schar v29, schar v30, schar v31)
183 {
184 val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
185 v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
186 v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
187 }
188 /* coverity[uninit_ctor]: suppress warning */
189 v_int8x32() {}
190
191 schar get0() const { return (schar)_v_cvtsi256_si32(val); }
192};
193
194struct v_uint16x16
195{
196 typedef ushort lane_type;
197 enum { nlanes = 16 };
198 __m256i val;
199
200 explicit v_uint16x16(__m256i v) : val(v) {}
201 v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3,
202 ushort v4, ushort v5, ushort v6, ushort v7,
203 ushort v8, ushort v9, ushort v10, ushort v11,
204 ushort v12, ushort v13, ushort v14, ushort v15)
205 {
206 val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
207 (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
208 (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
209 }
210 /* coverity[uninit_ctor]: suppress warning */
211 v_uint16x16() {}
212
213 ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
214};
215
216struct v_int16x16
217{
218 typedef short lane_type;
219 enum { nlanes = 16 };
220 __m256i val;
221
222 explicit v_int16x16(__m256i v) : val(v) {}
223 v_int16x16(short v0, short v1, short v2, short v3,
224 short v4, short v5, short v6, short v7,
225 short v8, short v9, short v10, short v11,
226 short v12, short v13, short v14, short v15)
227 {
228 val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
229 v8, v9, v10, v11, v12, v13, v14, v15);
230 }
231 /* coverity[uninit_ctor]: suppress warning */
232 v_int16x16() {}
233
234 short get0() const { return (short)_v_cvtsi256_si32(val); }
235};
236
237struct v_uint32x8
238{
239 typedef unsigned lane_type;
240 enum { nlanes = 8 };
241 __m256i val;
242
243 explicit v_uint32x8(__m256i v) : val(v) {}
244 v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
245 unsigned v4, unsigned v5, unsigned v6, unsigned v7)
246 {
247 val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
248 (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
249 }
250 /* coverity[uninit_ctor]: suppress warning */
251 v_uint32x8() {}
252
253 unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
254};
255
256struct v_int32x8
257{
258 typedef int lane_type;
259 enum { nlanes = 8 };
260 __m256i val;
261
262 explicit v_int32x8(__m256i v) : val(v) {}
263 v_int32x8(int v0, int v1, int v2, int v3,
264 int v4, int v5, int v6, int v7)
265 {
266 val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
267 }
268 /* coverity[uninit_ctor]: suppress warning */
269 v_int32x8() {}
270
271 int get0() const { return _v_cvtsi256_si32(val); }
272};
273
274struct v_float32x8
275{
276 typedef float lane_type;
277 enum { nlanes = 8 };
278 __m256 val;
279
280 explicit v_float32x8(__m256 v) : val(v) {}
281 v_float32x8(float v0, float v1, float v2, float v3,
282 float v4, float v5, float v6, float v7)
283 {
284 val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
285 }
286 /* coverity[uninit_ctor]: suppress warning */
287 v_float32x8() {}
288
289 float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
290};
291
292struct v_uint64x4
293{
294 typedef uint64 lane_type;
295 enum { nlanes = 4 };
296 __m256i val;
297
298 explicit v_uint64x4(__m256i v) : val(v) {}
299 v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
300 { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
301 /* coverity[uninit_ctor]: suppress warning */
302 v_uint64x4() {}
303
304 uint64 get0() const
305 {
306 #if defined __x86_64__ || defined _M_X64
307 return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
308 #else
309 int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
310 int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
311 return (unsigned)a | ((uint64)(unsigned)b << 32);
312 #endif
313 }
314};
315
316struct v_int64x4
317{
318 typedef int64 lane_type;
319 enum { nlanes = 4 };
320 __m256i val;
321
322 explicit v_int64x4(__m256i v) : val(v) {}
323 v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
324 { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
325 /* coverity[uninit_ctor]: suppress warning */
326 v_int64x4() {}
327
328 int64 get0() const
329 {
330 #if defined __x86_64__ || defined _M_X64
331 return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
332 #else
333 int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
334 int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
335 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
336 #endif
337 }
338};
339
340struct v_float64x4
341{
342 typedef double lane_type;
343 enum { nlanes = 4 };
344 __m256d val;
345
346 explicit v_float64x4(__m256d v) : val(v) {}
347 v_float64x4(double v0, double v1, double v2, double v3)
348 { val = _mm256_setr_pd(v0, v1, v2, v3); }
349 /* coverity[uninit_ctor]: suppress warning */
350 v_float64x4() {}
351
352 double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
353};
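// --- Illustrative usage sketch (not part of the original OpenCV header; the
// example_* helper below is hypothetical). The wrapper structs above expose the
// raw AVX register via .val, the lane count via nlanes, and lane 0 via get0().
inline float example_first_lane()
{
    v_float32x8 v(1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f); // 8 packed floats
    __m256 raw = v.val;                                     // raw register, if intrinsics are needed
    (void)raw;
    return v.get0() + (float)v_float32x8::nlanes;           // 1.f + 8 lanes = 9.f
}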
354
356
357#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \
358 inline _Tpvec v256_load(const _Tp* ptr) \
359 { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \
360 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
361 { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \
362 inline _Tpvec v256_load_low(const _Tp* ptr) \
363 { \
364 __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \
365 return _Tpvec(_mm256_castsi128_si256(v128)); \
366 } \
367 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
368 { \
369 __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \
370 __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \
371 return _Tpvec(_v256_combine(vlo, vhi)); \
372 } \
373 inline void v_store(_Tp* ptr, const _Tpvec& a) \
374 { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
375 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
376 { _mm256_store_si256((__m256i*)ptr, a.val); } \
377 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
378 { _mm256_stream_si256((__m256i*)ptr, a.val); } \
379 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
380 { \
381 if( mode == hal::STORE_UNALIGNED ) \
382 _mm256_storeu_si256((__m256i*)ptr, a.val); \
383 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
384 _mm256_stream_si256((__m256i*)ptr, a.val); \
385 else \
386 _mm256_store_si256((__m256i*)ptr, a.val); \
387 } \
388 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
389 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
390 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
391 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
392
393OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar)
394OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar)
395OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
396OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short)
397OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned)
398OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int)
399OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64)
400OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
401
402#define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
403 inline _Tpvec v256_load(const _Tp* ptr) \
404 { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \
405 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
406 { return _Tpvec(_mm256_load_##suffix(ptr)); } \
407 inline _Tpvec v256_load_low(const _Tp* ptr) \
408 { \
409 return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \
410 (_mm_loadu_##suffix(ptr))); \
411 } \
412 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
413 { \
414 halfreg vlo = _mm_loadu_##suffix(ptr0); \
415 halfreg vhi = _mm_loadu_##suffix(ptr1); \
416 return _Tpvec(_v256_combine(vlo, vhi)); \
417 } \
418 inline void v_store(_Tp* ptr, const _Tpvec& a) \
419 { _mm256_storeu_##suffix(ptr, a.val); } \
420 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
421 { _mm256_store_##suffix(ptr, a.val); } \
422 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
423 { _mm256_stream_##suffix(ptr, a.val); } \
424 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
425 { \
426 if( mode == hal::STORE_UNALIGNED ) \
427 _mm256_storeu_##suffix(ptr, a.val); \
428 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
429 _mm256_stream_##suffix(ptr, a.val); \
430 else \
431 _mm256_store_##suffix(ptr, a.val); \
432 } \
433 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
434 { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
435 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
436 { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
437
438OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128)
439OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
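// --- Illustrative usage sketch (not part of the original OpenCV header; the
// example_* helper below is hypothetical). v256_load reads 32 bytes from an
// unaligned pointer, v256_load_aligned requires 32-byte alignment, and
// v_store_aligned_nocache issues a non-temporal (streaming) store.
inline void example_copy8f(const float* src, float* dst)
{
    v_float32x8 v = v256_load(src); // unaligned load of 8 floats
    v_store(dst, v);                // unaligned store of 8 floats
}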
440
441#define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
442 inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
443 { return _Tpvec(cast(a.val)); }
444
445#define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
446 inline _Tpvec v256_setzero_##suffix() \
447 { return _Tpvec(_mm256_setzero_si256()); } \
448 inline _Tpvec v256_setall_##suffix(_Tp v) \
449 { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
450 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
451 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
452 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
453 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \
454 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \
455 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \
456 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \
457 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \
458 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \
459 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
460
461OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char)
462OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char)
463OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short)
464OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short)
465OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int)
466OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int)
467OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64)
468OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
469
470#define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
471 inline _Tpvec v256_setzero_##suffix() \
472 { return _Tpvec(_mm256_setzero_##zsuffix()); } \
473 inline _Tpvec v256_setall_##suffix(_Tp v) \
474 { return _Tpvec(_mm256_set1_##zsuffix(v)); } \
475 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
476 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
477 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
478 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \
479 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \
480 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \
481 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \
482 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast)
483
484OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps)
485OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
486
487inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
488{ return a; }
489inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
490{ return v_float32x8(_mm256_castpd_ps(a.val)); }
491
492inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
493{ return a; }
494inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
495{ return v_float64x4(_mm256_castps_pd(a.val)); }
496
497/* Recombine */
498/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
499 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
500 { return _Tpvec(perm(a.val, b.val, 0x20)); } \
501 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
502 { return _Tpvec(perm(a.val, b.val, 0x31)); } \
503 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
504 _Tpvec& c, _Tpvec& d) \
505 { c = v_combine_low(a, b); d = v_combine_high(a, b); }
506
507#define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \
508 OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \
509 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \
510 _Tpvec& b0, _Tpvec& b1) \
511 { \
512 __m256i v0 = _v256_shuffle_odd_64(a0.val); \
513 __m256i v1 = _v256_shuffle_odd_64(a1.val); \
514 b0.val = _mm256_unpacklo_##suffix(v0, v1); \
515 b1.val = _mm256_unpackhi_##suffix(v0, v1); \
516 }
517
518OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8)
519OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8)
520OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
521OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16)
522OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32)
523OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32)
524OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64)
525OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64)
526OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
527OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
528
529inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
530{
531 __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
532 __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
533 v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
534}
535
536inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
537{
538 __m256d v0 = _v_shuffle_odd_64(a0.val);
539 __m256d v1 = _v_shuffle_odd_64(a1.val);
540 b0.val = _mm256_unpacklo_pd(v0, v1);
541 b1.val = _mm256_unpackhi_pd(v0, v1);
542}*/
543
545
546// unpacks
547#define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \
548 inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
549 { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \
550 inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
551 { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
552
553OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8)
554OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8)
555OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
556OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16)
557OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32)
558OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32)
559OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64)
560OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64)
561OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
562OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
563
564// blend
565#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \
566 template<int m> \
567 inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \
568 { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
569
570OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
571OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16)
572OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32)
573OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32)
574OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
575OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
576
577template<int m>
578inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
579{
580 enum {M0 = m};
581 enum {M1 = (M0 | (M0 << 2)) & 0x33};
582 enum {M2 = (M1 | (M1 << 1)) & 0x55};
583 enum {MM = M2 | (M2 << 1)};
584 return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
585}
586template<int m>
587inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
588{ return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
589
590// shuffle
591// todo: emulate 64bit
592#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \
593 template<int m> \
594 inline _Tpvec v256_shuffle(const _Tpvec& a) \
595 { return _Tpvec(_mm256_##intrin(a.val, m)); }
596
597OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32)
598OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32)
599OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
600OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
601
602template<typename _Tpvec>
603inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
604{
605 ab0 = v256_unpacklo(a, b);
606 ab1 = v256_unpackhi(a, b);
607}
608
609template<typename _Tpvec>
610inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
611{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }
612
613inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
614{ return v256_blend<0xf0>(a, b); }
615
616inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
617{ return v256_blend<0xc>(a, b); }
618
619template<typename _Tpvec>
620inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
621{ return v256_permute2x128<0x21>(a, b); }
622
623template<typename _Tpvec>
624inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
625{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
626inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
627{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }
628// todo: emulate float32
629
630template<typename _Tpvec>
631inline _Tpvec v256_swap_halves(const _Tpvec& a)
632{ return v256_permute2x128<1>(a, a); }
633
634template<typename _Tpvec>
635inline _Tpvec v256_reverse_64(const _Tpvec& a)
636{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
637
638// ZIP
639#define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \
640 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
641 { return v256_permute2x128<0x20>(a, b); } \
642 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
643 { return v256_permute2x128<0x31>(a, b); } \
644 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
645 _Tpvec& c, _Tpvec& d) \
646 { \
647 _Tpvec a1b0 = v256_alignr_128(a, b); \
648 c = v256_combine_diagonal(a, a1b0); \
649 d = v256_combine_diagonal(a1b0, b); \
650 } \
651 inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
652 _Tpvec& ab0, _Tpvec& ab1) \
653 { \
654 _Tpvec ab0ab2, ab1ab3; \
655 v256_zip(a, b, ab0ab2, ab1ab3); \
656 v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
657 }
658
659OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
660OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
661OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
662OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
663OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
664OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
665OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
666OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
667OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
668OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
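// --- Illustrative usage sketch (not part of the original OpenCV header; the
// example_* helper below is hypothetical). v_zip interleaves two vectors lane
// by lane, while v_combine_low/v_combine_high concatenate the low or high
// 128-bit halves of two vectors.
inline void example_interleave(const v_int32x8& a, const v_int32x8& b,
                               v_int32x8& lo, v_int32x8& hi)
{
    // lo = {a0,b0,a1,b1,a2,b2,a3,b3}, hi = {a4,b4,a5,b5,a6,b6,a7,b7}
    v_zip(a, b, lo, hi);
}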
669
670
671
672/* Element-wise binary and unary operations */
673
674
675#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
676 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
677 { return _Tpvec(intrin(a.val, b.val)); } \
678 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
679 { a.val = intrin(a.val, b.val); return a; }
680
681OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
682OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
683OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
684OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
685OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
686OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
687OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
688OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
689OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
690OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
691OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
692OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
693OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
694OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
695OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
696OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
697OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
698OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
699
700OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
701OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
702OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
703OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
704OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
705OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
706OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
707OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
708
709// saturating multiply 8-bit, 16-bit
710inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
711{
712 v_uint16x16 c, d;
713 v_mul_expand(a, b, c, d);
714 return v_pack(c, d);
715}
716inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
717{
718 v_int16x16 c, d;
719 v_mul_expand(a, b, c, d);
720 return v_pack(c, d);
721}
722inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
723{
724 __m256i pl = _mm256_mullo_epi16(a.val, b.val);
725 __m256i ph = _mm256_mulhi_epu16(a.val, b.val);
726 __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
727 __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
728 return v_uint16x16(_v256_packs_epu32(p0, p1));
729}
730inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
731{
732 __m256i pl = _mm256_mullo_epi16(a.val, b.val);
733 __m256i ph = _mm256_mulhi_epi16(a.val, b.val);
734 __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
735 __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
736 return v_int16x16(_mm256_packs_epi32(p0, p1));
737}
738inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
739{ a = a * b; return a; }
740inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
741{ a = a * b; return a; }
742inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
743{ a = a * b; return a; }
744inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
745{ a = a * b; return a; }
746
748#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
749 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
750 { return _Tpvec(intrin(a.val, b.val)); }
751
752OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
753OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
754OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
755OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
756OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
757OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
758OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
759OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
760OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
761OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16, _mm256_mullo_epi16)
762
763inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
764{
765 __m256i ad = _mm256_srai_epi16(a.val, 8);
766 __m256i bd = _mm256_srai_epi16(b.val, 8);
767 __m256i p0 = _mm256_mullo_epi16(a.val, b.val); // even
768 __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8); // odd
769
770 const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
771 return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
772}
773inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
774{
775 return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
776}
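// --- Illustrative usage sketch (not part of the original OpenCV header; the
// example_* helper below is hypothetical). For 8- and 16-bit lanes the +/-
// operators above saturate (255 + 1 stays 255 for uchar lanes), whereas
// v_add_wrap/v_sub_wrap/v_mul_wrap use ordinary modulo-2^n wrap-around.
inline void example_saturate_vs_wrap(const v_uint8x32& a, const v_uint8x32& b,
                                     v_uint8x32& sat, v_uint8x32& wrap)
{
    sat  = a + b;            // saturating add: clamps each lane at 255
    wrap = v_add_wrap(a, b); // wrapping add: (a + b) & 0xFF per lane
}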
777
778// Multiply and expand
779inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
780 v_uint16x16& c, v_uint16x16& d)
781{
782 v_uint16x16 a0, a1, b0, b1;
783 v_expand(a, a0, a1);
784 v_expand(b, b0, b1);
785 c = v_mul_wrap(a0, b0);
786 d = v_mul_wrap(a1, b1);
787}
788
789inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
790 v_int16x16& c, v_int16x16& d)
791{
792 v_int16x16 a0, a1, b0, b1;
793 v_expand(a, a0, a1);
794 v_expand(b, b0, b1);
795 c = v_mul_wrap(a0, b0);
796 d = v_mul_wrap(a1, b1);
797}
798
799inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
800 v_int32x8& c, v_int32x8& d)
801{
802 v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
803
804 v_int16x16 v0, v1;
805 v_zip(v_mul_wrap(a, b), vhi, v0, v1);
806
807 c = v_reinterpret_as_s32(v0);
808 d = v_reinterpret_as_s32(v1);
809}
810
811inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
812 v_uint32x8& c, v_uint32x8& d)
813{
814 v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
815
816 v_uint16x16 v0, v1;
817 v_zip(v_mul_wrap(a, b), vhi, v0, v1);
818
819 c = v_reinterpret_as_u32(v0);
820 d = v_reinterpret_as_u32(v1);
821}
822
823inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
824 v_uint64x4& c, v_uint64x4& d)
825{
826 __m256i v0 = _mm256_mul_epu32(a.val, b.val);
827 __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
828 v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
829}
830
831inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
832inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }
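// --- Illustrative usage sketch (not part of the original OpenCV header; the
// example_* helper below is hypothetical). v_mul_expand produces the full-width
// product: two 16-bit vectors yield two 32-bit vectors (products of the low-half
// lanes in the third argument, high-half lanes in the fourth), while v_mul_hi
// keeps only the upper 16 bits of each 32-bit product.
inline void example_widening_multiply(const v_int16x16& a, const v_int16x16& b,
                                      v_int32x8& lo, v_int32x8& hi)
{
    v_mul_expand(a, b, lo, hi); // lo = a0*b0..a7*b7, hi = a8*b8..a15*b15
}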
833
835#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
836 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
837 { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
838 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
839 { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
840 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
841 { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
842 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
843 { return _Tpsvec(srai(a.val, imm)); } \
844 template<int imm> \
845 inline _Tpuvec v_shl(const _Tpuvec& a) \
846 { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
847 template<int imm> \
848 inline _Tpsvec v_shl(const _Tpsvec& a) \
849 { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
850 template<int imm> \
851 inline _Tpuvec v_shr(const _Tpuvec& a) \
852 { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
853 template<int imm> \
854 inline _Tpsvec v_shr(const _Tpsvec& a) \
855 { return _Tpsvec(srai(a.val, imm)); }
856
857OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
858OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32)
859
860inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
861{
862 __m256i d = _mm256_set1_epi64x((int64)1 << 63);
863 __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
864 return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
865}
866OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx)
867
868
869
870#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
871 OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
872 OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
873 OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
874 inline _Tpvec operator ~ (const _Tpvec& a) \
875 { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
876
877OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
878OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1))
879OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1))
880OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1))
881OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1))
882OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1))
883OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1))
884OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1))
885OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
886OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
887
889#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \
890 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
891 { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }
892
893OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8)
894OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8)
895OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
896OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8)
897OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8)
898OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8)
899OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
900OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
901
902
903#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
904 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
905 { return ~(a == b); } \
906 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
907 { return b > a; } \
908 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
909 { return ~(a < b); } \
910 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
911 { return b >= a; }
912
913#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
914 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
915 { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
916 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
917 { \
918 __m256i smask = _mm256_set1_##suffix(sbit); \
919 return _Tpuvec(_mm256_cmpgt_##suffix( \
920 _mm256_xor_si256(a.val, smask), \
921 _mm256_xor_si256(b.val, smask))); \
922 } \
923 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
924 { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
925 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
926 { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
927 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
928 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
929
930OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128)
931OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
932OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
933
934#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
935 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
936 { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
937 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
938 { return ~(a == b); }
939
940OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
941OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
942
943#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
944 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
945 { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
946
947#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
948 OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
949 OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
950 OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
951 OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
952 OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
953 OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
954
955OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
956OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
957
958inline v_float32x8 v_not_nan(const v_float32x8& a)
959{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
960inline v_float64x4 v_not_nan(const v_float64x4& a)
961{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
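// --- Illustrative usage sketch (not part of the original OpenCV header; the
// example_* helper below is hypothetical). Comparisons return all-ones/all-zeros
// masks per lane, v_select picks from its second argument where the mask is set
// and from the third elsewhere, and v_not_nan masks out NaN lanes.
inline v_float32x8 example_clamp_above(const v_float32x8& x, const v_float32x8& limit)
{
    v_float32x8 mask = x > limit;    // 0xFFFFFFFF in lanes where x exceeds the limit
    return v_select(mask, limit, x); // replace those lanes with the limit
}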
962
964OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8)
965OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8)
966OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8)
967OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8)
968OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
969OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
970OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16)
971OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16)
972OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32)
973OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32)
974OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32)
975OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32)
976OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
977OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
978OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
979OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
980
981
982template<int imm>
983inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
984{
985 enum {IMM_R = (16 - imm) & 0xFF};
986 enum {IMM_R2 = (32 - imm) & 0xFF};
987
988 if (imm == 0) return a;
989 if (imm == 32) return b;
990 if (imm > 32) return v_uint8x32();
991
992 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
993 if (imm == 16) return v_uint8x32(swap);
994 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
995 return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2)); // imm < 32
996}
997
998template<int imm>
999inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
1000{
1001 enum {IMM_L = (imm - 16) & 0xFF};
1002
1003 if (imm == 0) return a;
1004 if (imm == 32) return b;
1005 if (imm > 32) return v_uint8x32();
1006
1007 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
1008 if (imm == 16) return v_uint8x32(swap);
1009 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
1010 return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L));
1011}
1012
1013template<int imm>
1014inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
1015{
1016 enum {IMM_L = (imm - 16) & 0xFF};
1017 enum {IMM_R = (16 - imm) & 0xFF};
1018
1019 if (imm == 0) return a;
1020 if (imm > 32) return v_uint8x32();
1021
1022 // ESAC control[3] ? [127:0] = 0
1023 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
1024 if (imm == 16) return v_uint8x32(swapz);
1025 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
1026 return v_uint8x32(_mm256_slli_si256(swapz, IMM_L));
1027}
1028
1029template<int imm>
1030inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
1031{
1032 enum {IMM_L = (imm - 16) & 0xFF};
1033
1034 if (imm == 0) return a;
1035 if (imm > 32) return v_uint8x32();
1036
1037 // ESAC control[3] ? [127:0] = 0
1038 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
1039 if (imm == 16) return v_uint8x32(swapz);
1040 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
1041 return v_uint8x32(_mm256_srli_si256(swapz, IMM_L));
1042}
1043
1044#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \
1045 template<int imm> \
1046 inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
1047 { \
1048 enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1049 v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a), \
1050 v_reinterpret_as_u8(b)); \
1051 return _Tpvec(cast(ret.val)); \
1052 } \
1053 template<int imm> \
1054 inline _Tpvec intrin(const _Tpvec& a) \
1055 { \
1056 enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1057 v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a)); \
1058 return _Tpvec(cast(ret.val)); \
1059 }
1060
1061#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \
1062 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \
1063 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
1064
1065OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
1066OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
1067OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
1068OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
1069OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
1070OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
1071OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)
1072
1073OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps)
1074OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
1075OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
1076OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
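// --- Illustrative usage sketch (not part of the original OpenCV header; the
// example_* helper below is hypothetical). v_rotate_right<N> shifts lanes toward
// index 0 by N positions, filling the top with zeros (one-argument form) or with
// lanes from a second vector (two-argument form); v_rotate_left<N> shifts the
// other way.
inline v_int32x8 example_shift_lanes(const v_int32x8& a)
{
    // {a0, a1, ..., a7} -> {a1, ..., a7, 0}
    return v_rotate_right<1>(a);
}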
1077
1078
1079inline v_uint8x32 v_reverse(const v_uint8x32 &a)
1080{
1081 static const __m256i perm = _mm256_setr_epi8(
1082 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
1083 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1084 __m256i vec = _mm256_shuffle_epi8(a.val, perm);
1085 return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
1086}
1087
1088inline v_int8x32 v_reverse(const v_int8x32 &a)
1089{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1090
1091inline v_uint16x16 v_reverse(const v_uint16x16 &a)
1092{
1093 static const __m256i perm = _mm256_setr_epi8(
1094 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
1095 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1096 __m256i vec = _mm256_shuffle_epi8(a.val, perm);
1097 return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
1098}
1099
1100inline v_int16x16 v_reverse(const v_int16x16 &a)
1101{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1102
1103inline v_uint32x8 v_reverse(const v_uint32x8 &a)
1104{
1105 static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1106 return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
1107}
1108
1109inline v_int32x8 v_reverse(const v_int32x8 &a)
1110{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1111
1112inline v_float32x8 v_reverse(const v_float32x8 &a)
1113{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1114
1115inline v_uint64x4 v_reverse(const v_uint64x4 &a)
1116{
1117 return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
1118}
1119
1120inline v_int64x4 v_reverse(const v_int64x4 &a)
1121{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1122
1123inline v_float64x4 v_reverse(const v_float64x4 &a)
1124{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1125
1127
1129inline unsigned v_reduce_sum(const v_uint8x32& a)
1130{
1131 __m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
1132 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1133 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1134}
1135inline int v_reduce_sum(const v_int8x32& a)
1136{
1137 __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
1138 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1139 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
1140}
1141#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
1142 inline sctype v_reduce_##func(const _Tpvec& a) \
1143 { \
1144 __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
1145 val = intrin(val, _mm_srli_si128(val,8)); \
1146 val = intrin(val, _mm_srli_si128(val,4)); \
1147 val = intrin(val, _mm_srli_si128(val,2)); \
1148 val = intrin(val, _mm_srli_si128(val,1)); \
1149 return (sctype)_mm_cvtsi128_si32(val); \
1150 }
1151
1152OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
1153OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
1154OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
1155OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)
1156
1157#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
1158 inline sctype v_reduce_##func(const _Tpvec& a) \
1159 { \
1160 __m128i v0 = _v256_extract_low(a.val); \
1161 __m128i v1 = _v256_extract_high(a.val); \
1162 v0 = intrin(v0, v1); \
1163 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
1164 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
1165 v0 = intrin(v0, _mm_srli_si128(v0, 2)); \
1166 return (sctype) _mm_cvtsi128_si32(v0); \
1167 }
1168
1169OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
1170OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16)
1171OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
1172OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16)
1173
1174#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
1175 inline sctype v_reduce_##func(const _Tpvec& a) \
1176 { \
1177 __m128i v0 = _v256_extract_low(a.val); \
1178 __m128i v1 = _v256_extract_high(a.val); \
1179 v0 = intrin(v0, v1); \
1180 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
1181 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
1182 return (sctype) _mm_cvtsi128_si32(v0); \
1183 }
1184
1185OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
1186OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32)
1187OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
1188OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
1189
1190#define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \
1191 inline float v_reduce_##func(const v_float32x8& a) \
1192 { \
1193 __m128 v0 = _v256_extract_low(a.val); \
1194 __m128 v1 = _v256_extract_high(a.val); \
1195 v0 = intrin(v0, v1); \
1196 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
1197 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 1))); \
1198 return _mm_cvtss_f32(v0); \
1199 }
1200
1201OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
1202OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
1203
1204inline int v_reduce_sum(const v_int32x8& a)
1205{
1206 __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
1207 s0 = _mm256_hadd_epi32(s0, s0);
1208
1209 __m128i s1 = _v256_extract_high(s0);
1210 s1 = _mm_add_epi32(_v256_extract_low(s0), s1);
1211
1212 return _mm_cvtsi128_si32(s1);
1213}
1214
1215inline unsigned v_reduce_sum(const v_uint32x8& a)
1216{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
1217
1218inline int v_reduce_sum(const v_int16x16& a)
1219{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1220inline unsigned v_reduce_sum(const v_uint16x16& a)
1221{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1222
1223inline float v_reduce_sum(const v_float32x8& a)
1224{
1225 __m256 s0 = _mm256_hadd_ps(a.val, a.val);
1226 s0 = _mm256_hadd_ps(s0, s0);
1227
1228 __m128 s1 = _v256_extract_high(s0);
1229 s1 = _mm_add_ps(_v256_extract_low(s0), s1);
1230
1231 return _mm_cvtss_f32(s1);
1232}
1233
1234inline uint64 v_reduce_sum(const v_uint64x4& a)
1235{
1236 uint64 CV_DECL_ALIGNED(32) idx[2];
1237 _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
1238 return idx[0] + idx[1];
1239}
1240inline int64 v_reduce_sum(const v_int64x4& a)
1241{
1242 int64 CV_DECL_ALIGNED(32) idx[2];
1243 _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
1244 return idx[0] + idx[1];
1245}
1246inline double v_reduce_sum(const v_float64x4& a)
1247{
1248 __m256d s0 = _mm256_hadd_pd(a.val, a.val);
1249 return _mm_cvtsd_f64(_mm_add_pd(_v256_extract_low(s0), _v256_extract_high(s0)));
1250}
1251
1252inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
1253 const v_float32x8& c, const v_float32x8& d)
1254{
1255 __m256 ab = _mm256_hadd_ps(a.val, b.val);
1256 __m256 cd = _mm256_hadd_ps(c.val, d.val);
1257 return v_float32x8(_mm256_hadd_ps(ab, cd));
1258}
1259
1260inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
1261{
1262 __m256i half = _mm256_sad_epu8(a.val, b.val);
1263 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1264 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1265}
1266inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
1267{
1268 __m256i half = _mm256_set1_epi8(0x7f);
1269 half = _mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half));
1270 __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
1271 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1272}
1273inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
1274{
1275 v_uint32x8 l, h;
1276 v_expand(v_add_wrap(a - b, b - a), l, h);
1277 return v_reduce_sum(l + h);
1278}
1279inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
1280{
1281 v_uint32x8 l, h;
1282 v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
1283 return v_reduce_sum(l + h);
1284}
1285inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
1286{
1287 return v_reduce_sum(v_max(a, b) - v_min(a, b));
1288}
1289inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
1290{
1291 v_int32x8 m = a < b;
1292 return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
1293}
1294inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
1295{
1296 return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
1297}
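// --- Illustrative usage sketch (not part of the original OpenCV header; the
// example_* helper below is hypothetical). The horizontal reductions above
// collapse a whole vector into one scalar: v_reduce_sum adds all lanes,
// v_reduce_min/v_reduce_max return the extreme lane, and v_reduce_sad gives the
// sum of absolute differences of two vectors.
inline float example_dot8(const float* a, const float* b)
{
    v_float32x8 va = v256_load(a);
    v_float32x8 vb = v256_load(b);
    return v_reduce_sum(va * vb); // a[0]*b[0] + ... + a[7]*b[7]
}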
1298
1300inline v_uint8x32 v_popcount(const v_uint8x32& a)
1301{
1302 __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1303 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
1304 __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
1305 return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256( a.val , _popcnt_mask)),
1306 _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
1307}
1308inline v_uint16x16 v_popcount(const v_uint16x16& a)
1309{
1310 v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
1311 p += v_rotate_right<1>(p);
1312 return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
1313}
1314inline v_uint32x8 v_popcount(const v_uint32x8& a)
1315{
1316 v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
1317 p += v_rotate_right<1>(p);
1318 p += v_rotate_right<2>(p);
1319 return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
1320}
1321inline v_uint64x4 v_popcount(const v_uint64x4& a)
1322{
1323 return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
1324}
1325inline v_uint8x32 v_popcount(const v_int8x32& a)
1326{ return v_popcount(v_reinterpret_as_u8(a)); }
1327inline v_uint16x16 v_popcount(const v_int16x16& a)
1328{ return v_popcount(v_reinterpret_as_u16(a)); }
1329inline v_uint32x8 v_popcount(const v_int32x8& a)
1330{ return v_popcount(v_reinterpret_as_u32(a)); }
1331inline v_uint64x4 v_popcount(const v_int64x4& a)
1332{ return v_popcount(v_reinterpret_as_u64(a)); }
1333
1335inline int v_signmask(const v_int8x32& a)
1336{ return _mm256_movemask_epi8(a.val); }
1337inline int v_signmask(const v_uint8x32& a)
1338{ return v_signmask(v_reinterpret_as_s8(a)); }
1339
1340inline int v_signmask(const v_int16x16& a)
1341{ return v_signmask(v_pack(a, a)) & 0xFFFF; }
1342inline int v_signmask(const v_uint16x16& a)
1343{ return v_signmask(v_reinterpret_as_s16(a)); }
1344
1345inline int v_signmask(const v_float32x8& a)
1346{ return _mm256_movemask_ps(a.val); }
1347inline int v_signmask(const v_float64x4& a)
1348{ return _mm256_movemask_pd(a.val); }
1349
1350inline int v_signmask(const v_int32x8& a)
1351{ return v_signmask(v_reinterpret_as_f32(a)); }
1352inline int v_signmask(const v_uint32x8& a)
1353{ return v_signmask(v_reinterpret_as_f32(a)); }
1354
1355inline int v_signmask(const v_int64x4& a)
1356{ return v_signmask(v_reinterpret_as_f64(a)); }
1357inline int v_signmask(const v_uint64x4& a)
1358{ return v_signmask(v_reinterpret_as_f64(a)); }
1359
1360inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1361inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1362inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1363inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1364inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1365inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1366inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1367inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1368inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1369inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1370
1372#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask) \
1373 inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
1374 inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
1375OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, -1)
1376OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, -1)
1377OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, 255)
1378OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, 255)
1379OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4, 15)
1380OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4, 15)
1381OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
1382OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)
1383
1384#define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec) \
1385 inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
1386 inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
1387OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
1388OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
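// --- Illustrative usage sketch (not part of the original OpenCV header; the
// example_* helper below is hypothetical). v_signmask packs the sign bit of every
// lane into an int, v_check_any/v_check_all test whether any/all lanes are set,
// and v_scan_forward returns the index of the first set lane.
inline int example_find_first_above(const v_float32x8& x, const v_float32x8& thresh)
{
    v_float32x8 mask = x > thresh;
    if (!v_check_any(mask))
        return -1;               // no lane exceeds the threshold
    return v_scan_forward(mask); // index of the first lane that does
}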
1389
1390
1391
1392
1393#if CV_FMA3
1394#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
1395 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1396 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1397 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1398 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }
1399#else
1400#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
1401 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1402 { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); } \
1403 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1404 { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); }
1405#endif
1406
1407#define OPENCV_HAL_IMPL_AVX_MISC(_Tpvec, suffix) \
1408 inline _Tpvec v_sqrt(const _Tpvec& x) \
1409 { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
1410 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1411 { return v_fma(a, a, b * b); } \
1412 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1413 { return v_sqrt(v_fma(a, a, b*b)); }
1414
1415OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
1416OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
1417OPENCV_HAL_IMPL_AVX_MISC(v_float32x8, ps)
1418OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd)
1419
1420inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1421{
1422 return a * b + c;
1423}
1424
1425inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1426{
1427 return v_fma(a, b, c);
1428}
1429
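// Approximate reciprocal square root: _mm256_rsqrt_ps gives roughly 12 bits of precision and
// one Newton-Raphson step (t *= 1.5 - 0.5*x*t*t) refines it; the float64 version falls back to 1/sqrt.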
1430inline v_float32x8 v_invsqrt(const v_float32x8& x)
1431{
1432 v_float32x8 half = x * v256_setall_f32(0.5);
1433 v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
1434    // TODO: use _mm256_fnmsub_ps here when FMA3 is available
1435 t *= v256_setall_f32(1.5) - ((t * t) * half);
1436 return t;
1437}
1438
1439inline v_float64x4 v_invsqrt(const v_float64x4& x)
1440{
1441 return v256_setall_f64(1.) / v_sqrt(x);
1442}
1443
1445#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \
1446 inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
1447 { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }
1448
1449OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8)
1450OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
1451OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
1452
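// Floating-point absolute value simply clears the sign bit: 0x7fffffff for float32, and an
// all-ones pattern shifted right by one (0x7fffffffffffffff) for float64.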
1453inline v_float32x8 v_abs(const v_float32x8& x)
1454{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
1455inline v_float64x4 v_abs(const v_float64x4& x)
1456{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
1457
1459inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
1460{ return v_add_wrap(a - b, b - a); }
1461inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
1462{ return v_add_wrap(a - b, b - a); }
1463inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
1464{ return v_max(a, b) - v_min(a, b); }
1465
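// Signed absolute difference: m is all-ones where a < b, so (d ^ m) - m conditionally negates
// the (wrapping) difference, yielding |a - b| reinterpreted as an unsigned vector.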
1466inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
1467{
1468 v_int8x32 d = v_sub_wrap(a, b);
1469 v_int8x32 m = a < b;
1470 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1471}
1472
1473inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
1474{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1475
1476inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
1477{
1478 v_int32x8 d = a - b;
1479 v_int32x8 m = a < b;
1480 return v_reinterpret_as_u32((d ^ m) - m);
1481}
1482
1483inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
1484{ return v_abs(a - b); }
1485
1486inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
1487{ return v_abs(a - b); }
1488
1490inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
1491{
1492 v_int8x32 d = a - b;
1493 v_int8x32 m = a < b;
1494 return (d ^ m) - m;
1495}
1496inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
1497{ return v_max(a, b) - v_min(a, b); }
1498
1500
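// Rounding conversions: v_round converts with the current rounding mode (round-to-nearest-even
// by default), v_trunc uses the truncating conversion, and v_floor / v_ceil round with
// _mm256_floor/_mm256_ceil first and then truncate.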
1502inline v_int32x8 v_round(const v_float32x8& a)
1503{ return v_int32x8(_mm256_cvtps_epi32(a.val)); }
1504
1505inline v_int32x8 v_round(const v_float64x4& a)
1506{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
1507
1508inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
1509{
1510 __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val);
1511 return v_int32x8(_v256_combine(ai, bi));
1512}
1513
1514inline v_int32x8 v_trunc(const v_float32x8& a)
1515{ return v_int32x8(_mm256_cvttps_epi32(a.val)); }
1516
1517inline v_int32x8 v_trunc(const v_float64x4& a)
1518{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }
1519
1520inline v_int32x8 v_floor(const v_float32x8& a)
1521{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }
1522
1523inline v_int32x8 v_floor(const v_float64x4& a)
1524{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }
1525
1526inline v_int32x8 v_ceil(const v_float32x8& a)
1527{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }
1528
1529inline v_int32x8 v_ceil(const v_float64x4& a)
1530{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
1531
1533inline v_float32x8 v_cvt_f32(const v_int32x8& a)
1534{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); }
1535
1536inline v_float32x8 v_cvt_f32(const v_float64x4& a)
1537{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }
1538
1539inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
1540{
1541 __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
1542 return v_float32x8(_v256_combine(af, bf));
1543}
1544
1545inline v_float64x4 v_cvt_f64(const v_int32x8& a)
1546{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }
1547
1548inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
1549{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }
1550
1551inline v_float64x4 v_cvt_f64(const v_float32x8& a)
1552{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }
1553
1554inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
1555{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
1556
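// int64 -> float64 has no direct AVX2 instruction; the 32-bit halves of each lane are biased
// into the double mantissa range with the magic constants below and then recombined.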
1557// from (Mysticial and wim) https://stackoverflow.com/q/41144668
1558inline v_float64x4 v_cvt_f64(const v_int64x4& v)
1559{
1560 // constants encoded as floating-point
1561 __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); // 2^52
1562 __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
1563 __m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
1564 __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
1565
1566 // Blend the 32 lowest significant bits of v with magic_int_lo
1567 __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
1568 // Extract the 32 most significant bits of v
1569 __m256i v_hi = _mm256_srli_epi64(v.val, 32);
1570 // Flip the msb of v_hi and blend with 0x45300000
1571 v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);
1572 // Compute in double precision
1573 __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
1574    // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
1575 __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
1576 return v_float64x4(result);
1577}
1578
1580
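// Lookup-table access: v256_lut gathers one element per index, while the *_pairs and *_quads
// variants read 2 or 4 consecutive elements starting at each index, using AVX2 gathers where
// a suitable one exists.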
1581inline v_int8x32 v256_lut(const schar* tab, const int* idx)
1582{
1583 return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
1584 tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
1585 tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
1586 tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
1587}
1588inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
1589{
1590 return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
1591 *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
1592 *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
1593 *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
1594}
1595inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
1596{
1597 return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1));
1598}
1599inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
1600inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
1601inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
1602
1603inline v_int16x16 v256_lut(const short* tab, const int* idx)
1604{
1605 return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
1606 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
1607}
1608inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
1609{
1610 return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2));
1611}
1612inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
1613{
1614#if defined(__GNUC__)
1615    return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2)); // GCC headers declare this gather with a 'long long int*' base pointer, hence the cast
1616#else
1617 return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
1618#endif
1619}
1620inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
1621inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
1622inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
1623
1624inline v_int32x8 v256_lut(const int* tab, const int* idx)
1625{
1626 return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
1627}
1628inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
1629{
1630#if defined(__GNUC__)
1631 return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
1632#else
1633 return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
1634#endif
1635}
1636inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
1637{
1638 return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
1639}
1640inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
1641inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
1642inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
1643
1644inline v_int64x4 v256_lut(const int64* tab, const int* idx)
1645{
1646#if defined(__GNUC__)
1647 return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8));
1648#else
1649 return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8));
1650#endif
1651}
1652inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
1653{
1654 return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
1655}
1656inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
1657inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
1658
1659inline v_float32x8 v256_lut(const float* tab, const int* idx)
1660{
1661 return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
1662}
1663inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
1664inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }
1665
1666inline v_float64x4 v256_lut(const double* tab, const int* idx)
1667{
1668 return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
1669}
1670inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); }
1671
1672inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
1673{
1674 return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
1675}
1676
1677inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
1678{
1679 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1680}
1681
1682inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
1683{
1684 return v_float32x8(_mm256_i32gather_ps(tab, idxvec.val, 4));
1685}
1686
1687inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
1688{
1689 return v_float64x4(_mm256_i32gather_pd(tab, _mm256_castsi256_si128(idxvec.val), 8));
1690}
1691
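// Gather (x, y) point pairs from tab at the given indices and deinterleave them into separate
// x and y vectors.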
1692inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
1693{
1694 int CV_DECL_ALIGNED(32) idx[8];
1695 v_store_aligned(idx, idxvec);
1696 __m128 z = _mm_setzero_ps();
1697 __m128 xy01, xy45, xy23, xy67;
1698 xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
1699 xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
1700 xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
1701 xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
1702 __m256 xy0145 = _v256_combine(xy01, xy45);
1703 xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
1704 xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
1705 xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
1706 xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
1707 __m256 xy2367 = _v256_combine(xy23, xy67);
1708
1709 __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
1710 __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
1711
1712 x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
1713 y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
1714}
1715
1716inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
1717{
1718 int CV_DECL_ALIGNED(32) idx[4];
1719 v_store_low(idx, idxvec);
1720 __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
1721 __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
1722 __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
1723 __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
1724 __m256d xy02 = _v256_combine(xy0, xy2);
1725 __m256d xy13 = _v256_combine(xy1, xy3);
1726
1727 x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
1728 y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
1729}
1730
1731inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
1732{
1733 return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
1734}
1735inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1736inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
1737{
1738 return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
1739}
1740inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1741
1742inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
1743{
1744 return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
1745}
1746inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1747inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
1748{
1749 return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
1750}
1751inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1752
1753inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
1754{
1755 return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
1756}
1757inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1758inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1759
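// v_pack_triplets packs the first three elements of every group of four contiguously;
// the last quarter of the result vector is left unspecified.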
1760inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
1761{
1762 return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))),
1763 _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1764}
1765inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1766
1767inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
1768{
1769 return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))),
1770 _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1771}
1772inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1773
1774inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
1775{
1776 return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1777}
1778inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1779inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
1780{
1781 return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1782}
1783
1785
1787
1788// 16 >> 32
1789inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
1790{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
1791inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1792{ return v_dotprod(a, b) + c; }
1793
1794// 32 >> 64
1795inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
1796{
1797 __m256i even = _mm256_mul_epi32(a.val, b.val);
1798 __m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
1799 return v_int64x4(_mm256_add_epi64(even, odd));
1800}
1801inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1802{ return v_dotprod(a, b) + c; }
1803
1804// 8 >> 32
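// The unsigned version isolates even bytes by blending with zero and odd bytes by a logical
// shift; the signed version sign-extends with shifts. Two pmaddwd passes then make each
// 32-bit lane the sum of four byte products.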
1805inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
1806{
1807 __m256i even_m = _mm256_set1_epi32(0xFF00FF00);
1808 __m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m);
1809 __m256i odd_a = _mm256_srli_epi16(a.val, 8);
1810
1811 __m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m);
1812 __m256i odd_b = _mm256_srli_epi16(b.val, 8);
1813
1814 __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
1815 __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
1816 return v_uint32x8(_mm256_add_epi32(prod0, prod1));
1817}
1818inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1819{ return v_dotprod_expand(a, b) + c; }
1820
1821inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
1822{
1823 __m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
1824 __m256i odd_a = _mm256_srai_epi16(a.val, 8);
1825
1826 __m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
1827 __m256i odd_b = _mm256_srai_epi16(b.val, 8);
1828
1829 __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
1830 __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
1831 return v_int32x8(_mm256_add_epi32(prod0, prod1));
1832}
1833inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1834{ return v_dotprod_expand(a, b) + c; }
1835
1836// 16 >> 64
1837inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
1838{
1839 __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
1840 __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
1841 __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
1842 __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
1843
1844 __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
1845 __m256i p13 = _mm256_srli_epi64(mul0, 32);
1846 __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
1847 __m256i p57 = _mm256_srli_epi64(mul1, 32);
1848
1849 __m256i p15_ = _mm256_add_epi64(p02, p13);
1850 __m256i p9d_ = _mm256_add_epi64(p46, p57);
1851
1852 return v_uint64x4(_mm256_add_epi64(
1853 _mm256_unpacklo_epi64(p15_, p9d_),
1854 _mm256_unpackhi_epi64(p15_, p9d_)
1855 ));
1856}
1857inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1858{ return v_dotprod_expand(a, b) + c; }
1859
1860inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
1861{
1862 __m256i prod = _mm256_madd_epi16(a.val, b.val);
1863 __m256i sign = _mm256_srai_epi32(prod, 31);
1864
1865 __m256i lo = _mm256_unpacklo_epi32(prod, sign);
1866 __m256i hi = _mm256_unpackhi_epi32(prod, sign);
1867
1868 return v_int64x4(_mm256_add_epi64(
1869 _mm256_unpacklo_epi64(lo, hi),
1870 _mm256_unpackhi_epi64(lo, hi)
1871 ));
1872}
1873inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1874{ return v_dotprod_expand(a, b) + c; }
1875
1876// 32 >> 64f
1877inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
1878{ return v_cvt_f64(v_dotprod(a, b)); }
1879inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1880{ return v_dotprod_expand(a, b) + c; }
1881
1883
1884// 16 >> 32
1885inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
1886{ return v_dotprod(a, b); }
1887inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1888{ return v_dotprod(a, b, c); }
1889
1890// 32 >> 64
1891inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
1892{ return v_dotprod(a, b); }
1893inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1894{ return v_dotprod(a, b, c); }
1895
1896// 8 >> 32
1897inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
1898{ return v_dotprod_expand(a, b); }
1899inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1900{ return v_dotprod_expand(a, b, c); }
1901
1902inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
1903{ return v_dotprod_expand(a, b); }
1904inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1905{ return v_dotprod_expand(a, b, c); }
1906
1907// 16 >> 64
1908inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
1909{
1910 __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
1911 __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
1912 __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
1913 __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
1914
1915 __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
1916 __m256i p13 = _mm256_srli_epi64(mul0, 32);
1917 __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
1918 __m256i p57 = _mm256_srli_epi64(mul1, 32);
1919
1920 __m256i p15_ = _mm256_add_epi64(p02, p13);
1921 __m256i p9d_ = _mm256_add_epi64(p46, p57);
1922
1923 return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
1924}
1925inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1926{ return v_dotprod_expand_fast(a, b) + c; }
1927
1928inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
1929{
1930 __m256i prod = _mm256_madd_epi16(a.val, b.val);
1931 __m256i sign = _mm256_srai_epi32(prod, 31);
1932 __m256i lo = _mm256_unpacklo_epi32(prod, sign);
1933 __m256i hi = _mm256_unpackhi_epi32(prod, sign);
1934 return v_int64x4(_mm256_add_epi64(lo, hi));
1935}
1936inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1937{ return v_dotprod_expand_fast(a, b) + c; }
1938
1939// 32 >> 64f
1940inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
1941{ return v_dotprod_expand(a, b); }
1942inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1943{ return v_dotprod_expand(a, b, c); }
1944
1945#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
1946 v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1947
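// OPENCV_HAL_AVX_SPLAT2_PS broadcasts element 'im' within each 128-bit half, so v_matmul and
// v_matmuladd perform two independent 4x4 matrix-vector products, one per half.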
1948inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
1949 const v_float32x8& m1, const v_float32x8& m2,
1950 const v_float32x8& m3)
1951{
1952 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1953 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1954 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1955 v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
1956 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1957}
1958
1959inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
1960 const v_float32x8& m1, const v_float32x8& m2,
1961 const v_float32x8& a)
1962{
1963 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1964 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1965 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1966 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1967}
1968
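// Transpose 4x4 blocks of 32-bit elements, one block per 128-bit half: pairwise 32-bit unpacks
// interleave the rows, then 64-bit unpacks complete the transpose.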
1969#define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1970 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1971 const _Tpvec& a2, const _Tpvec& a3, \
1972 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1973 { \
1974 __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \
1975 __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \
1976 __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \
1977 __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \
1978 b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \
1979 b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \
1980 b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \
1981 b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \
1982 }
1983
1984OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1985OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1986OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
1987
1988
1989
1990/* Expand */
1991#define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1992 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1993 { \
1994 b0.val = intrin(_v256_extract_low(a.val)); \
1995 b1.val = intrin(_v256_extract_high(a.val)); \
1996 } \
1997 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1998 { return _Tpwvec(intrin(_v256_extract_low(a.val))); } \
1999 inline _Tpwvec v_expand_high(const _Tpvec& a) \
2000 { return _Tpwvec(intrin(_v256_extract_high(a.val))); } \
2001 inline _Tpwvec v256_load_expand(const _Tp* ptr) \
2002 { \
2003 __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
2004 return _Tpwvec(intrin(a)); \
2005 }
2006
2007OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16)
2008OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16)
2009OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32)
2010OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32)
2011OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64)
2012OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64)
2013
2014#define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \
2015 inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
2016 { \
2017 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
2018 return _Tpvec(intrin(a)); \
2019 }
2020
2021OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
2022OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32)
2023
2024/* pack */
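// AVX2 pack instructions operate on each 128-bit lane separately, so the results are passed
// through _v256_shuffle_odd_64 to restore the natural element order.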
2025// 16
2026inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
2027{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
2028
2029inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
2030{
2031 __m256i t = _mm256_set1_epi16(255);
2032 __m256i a1 = _mm256_min_epu16(a.val, t);
2033 __m256i b1 = _mm256_min_epu16(b.val, t);
2034 return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1)));
2035}
2036
2037inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
2038{
2039 return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
2040}
2041
2042inline void v_pack_store(schar* ptr, const v_int16x16& a)
2043{ v_store_low(ptr, v_pack(a, a)); }
2044
2045inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
2046{
2047 const __m256i m = _mm256_set1_epi16(255);
2048 __m256i am = _mm256_min_epu16(a.val, m);
2049 am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
2050 v_store_low(ptr, v_uint8x32(am));
2051}
2052
2053inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
2054{ v_store_low(ptr, v_pack_u(a, a)); }
2055
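// Rounding right shift before packing: adding delta = 1 << (n-1) prior to the shift rounds
// the result to the nearest integer.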
2056template<int n> inline
2057v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
2058{
2059 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
2060 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
2061 return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
2062 v_reinterpret_as_s16((b + delta) >> n));
2063}
2064
2065template<int n> inline
2066void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
2067{
2068 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
2069 v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
2070}
2071
2072template<int n> inline
2073v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
2074{
2075 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2076 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2077}
2078
2079template<int n> inline
2080void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
2081{
2082 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2083 v_pack_u_store(ptr, (a + delta) >> n);
2084}
2085
2086template<int n> inline
2087v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
2088{
2089 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2090 return v_pack((a + delta) >> n, (b + delta) >> n);
2091}
2092
2093template<int n> inline
2094void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
2095{
2096 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
2097 v_pack_store(ptr, (a + delta) >> n);
2098}
2099
2100// 32
2101inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
2102{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }
2103
2104inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
2105{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
2106
2107inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
2108{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
2109
2110inline void v_pack_store(short* ptr, const v_int32x8& a)
2111{ v_store_low(ptr, v_pack(a, a)); }
2112
2113inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
2114{
2115 const __m256i m = _mm256_set1_epi32(65535);
2116 __m256i am = _mm256_min_epu32(a.val, m);
2117 am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
2118 v_store_low(ptr, v_uint16x16(am));
2119}
2120
2121inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
2122{ v_store_low(ptr, v_pack_u(a, a)); }
2123
2124
2125template<int n> inline
2126v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
2127{
2128 // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
2129 v_uint32x8 delta = v256_setall_u32(1 << (n-1));
2130 return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
2131 v_reinterpret_as_s32((b + delta) >> n));
2132}
2133
2134template<int n> inline
2135void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
2136{
2137 v_uint32x8 delta = v256_setall_u32(1 << (n-1));
2138 v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
2139}
2140
2141template<int n> inline
2142v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
2143{
2144 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2145 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2146}
2147
2148template<int n> inline
2149void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
2150{
2151 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2152 v_pack_u_store(ptr, (a + delta) >> n);
2153}
2154
2155template<int n> inline
2156v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
2157{
2158 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2159 return v_pack((a + delta) >> n, (b + delta) >> n);
2160}
2161
2162template<int n> inline
2163void v_rshr_pack_store(short* ptr, const v_int32x8& a)
2164{
2165 v_int32x8 delta = v256_setall_s32(1 << (n-1));
2166 v_pack_store(ptr, (a + delta) >> n);
2167}
2168
2169// 64
2170// Non-saturating pack
2171inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
2172{
2173 __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
2174 __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
2175 __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3
2176 return v_uint32x8(_v256_shuffle_odd_64(ab));
2177}
2178
2179inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
2180{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2181
2182inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
2183{
2184 __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
2185 v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
2186}
2187
2188inline void v_pack_store(int* ptr, const v_int64x4& b)
2189{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2190
2191template<int n> inline
2192v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
2193{
2194 v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
2195 return v_pack((a + delta) >> n, (b + delta) >> n);
2196}
2197
2198template<int n> inline
2199void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
2200{
2201 v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
2202 v_pack_store(ptr, (a + delta) >> n);
2203}
2204
2205template<int n> inline
2206v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
2207{
2208 v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
2209 return v_pack((a + delta) >> n, (b + delta) >> n);
2210}
2211
2212template<int n> inline
2213void v_rshr_pack_store(int* ptr, const v_int64x4& a)
2214{
2215 v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
2216 v_pack_store(ptr, (a + delta) >> n);
2217}
2218
2219// pack boolean
2220inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
2221{
2222 __m256i ab = _mm256_packs_epi16(a.val, b.val);
2223 return v_uint8x32(_v256_shuffle_odd_64(ab));
2224}
2225
2226inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
2227 const v_uint32x8& c, const v_uint32x8& d)
2228{
2229 __m256i ab = _mm256_packs_epi32(a.val, b.val);
2230 __m256i cd = _mm256_packs_epi32(c.val, d.val);
2231
2232 __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd));
2233 return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)));
2234}
2235
2236inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2237 const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
2238 const v_uint64x4& g, const v_uint64x4& h)
2239{
2240 __m256i ab = _mm256_packs_epi32(a.val, b.val);
2241 __m256i cd = _mm256_packs_epi32(c.val, d.val);
2242 __m256i ef = _mm256_packs_epi32(e.val, f.val);
2243 __m256i gh = _mm256_packs_epi32(g.val, h.val);
2244
2245 __m256i abcd = _mm256_packs_epi32(ab, cd);
2246 __m256i efgh = _mm256_packs_epi32(ef, gh);
2247 __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh));
2248
2249 __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
2250 return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
2251}
2252
2253/* Recombine */
2254// it's implemented above, together with the load and store operations
2255
2256/* Extract */
2257#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) \
2258 template<int s> \
2259 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2260 { return v_rotate_right<s>(a, b); }
2261
2262OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
2263OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
2264OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
2265OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
2266OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
2267OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
2268OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
2269OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
2270OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
2271OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
2272
2273template<int i>
2274inline uchar v_extract_n(v_uint8x32 a)
2275{
2276 return (uchar)_v256_extract_epi8<i>(a.val);
2277}
2278
2279template<int i>
2280inline schar v_extract_n(v_int8x32 a)
2281{
2282 return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
2283}
2284
2285template<int i>
2286inline ushort v_extract_n(v_uint16x16 a)
2287{
2288 return (ushort)_v256_extract_epi16<i>(a.val);
2289}
2290
2291template<int i>
2292inline short v_extract_n(v_int16x16 a)
2293{
2294 return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
2295}
2296
2297template<int i>
2298inline uint v_extract_n(v_uint32x8 a)
2299{
2300 return (uint)_v256_extract_epi32<i>(a.val);
2301}
2302
2303template<int i>
2304inline int v_extract_n(v_int32x8 a)
2305{
2306 return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
2307}
2308
2309template<int i>
2310inline uint64 v_extract_n(v_uint64x4 a)
2311{
2312 return (uint64)_v256_extract_epi64<i>(a.val);
2313}
2314
2315template<int i>
2316inline int64 v_extract_n(v_int64x4 v)
2317{
2318 return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
2319}
2320
2321template<int i>
2322inline float v_extract_n(v_float32x8 v)
2323{
2324 union { uint iv; float fv; } d;
2325 d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
2326 return d.fv;
2327}
2328
2329template<int i>
2330inline double v_extract_n(v_float64x4 v)
2331{
2332 union { uint64 iv; double dv; } d;
2333 d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
2334 return d.dv;
2335}
2336
2337template<int i>
2338inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
2339{
2340 static const __m256i perm = _mm256_set1_epi32((char)i);
2341 return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
2342}
2343
2344template<int i>
2345inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
2346{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2347
2348template<int i>
2349inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
2350{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2351
2352
2354
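// Channel deinterleaving: data is loaded in full 256-bit chunks, reshuffled within the 128-bit
// lanes, recombined across lanes with permute2x128 and blends, and finally split into
// per-channel vectors.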
2355inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
2356{
2357 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2358 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2359
2360 const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
2361 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
2362 __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
2363 __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
2364 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2365 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2366 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2367 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2368 a = v_uint8x32(a0);
2369 b = v_uint8x32(b0);
2370}
2371
2372inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
2373{
2374 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2375 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2376
2377 const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2378 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
2379 __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
2380 __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
2381 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2382 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2383 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2384 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2385 a = v_uint16x16(a0);
2386 b = v_uint16x16(b0);
2387}
2388
2389inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
2390{
2391 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2392 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2393
2394 enum { sh = 0+2*4+1*16+3*64 };
2395 __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
2396 __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
2397 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2398 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2399 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2400 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2401 a = v_uint32x8(a0);
2402 b = v_uint32x8(b0);
2403}
2404
2405inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
2406{
2407 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
2408 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2409
2410 __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
2411 __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
2412 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2413 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2414 a = v_uint64x4(a0);
2415 b = v_uint64x4(b0);
2416}
2417
2418inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
2419{
2420 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2421 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2422 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
2423
2424 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2425 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2426
2427 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2428 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2429 const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2430 -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
2431
2432 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2433 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2434 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2435
2436 const __m256i
2437 sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
2438 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
2439 sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
2440 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
2441 sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2442 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2443 b0 = _mm256_shuffle_epi8(b0, sh_b);
2444 g0 = _mm256_shuffle_epi8(g0, sh_g);
2445 r0 = _mm256_shuffle_epi8(r0, sh_r);
2446
2447 a = v_uint8x32(b0);
2448 b = v_uint8x32(g0);
2449 c = v_uint8x32(r0);
2450}
2451
2452inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
2453{
2454 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2455 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2456 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2457
2458 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2459 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2460
2461 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2462 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2463 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2464 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2465 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2466 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2467 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2468 const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2469 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2470 const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2471 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2472 const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2473 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2474 b0 = _mm256_shuffle_epi8(b0, sh_b);
2475 g0 = _mm256_shuffle_epi8(g0, sh_g);
2476 r0 = _mm256_shuffle_epi8(r0, sh_r);
2477
2478 a = v_uint16x16(b0);
2479 b = v_uint16x16(g0);
2480 c = v_uint16x16(r0);
2481}
2482
2483inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
2484{
2485 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2486 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2487 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2488
2489 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2490 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2491
2492 __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
2493 __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
2494 __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
2495
2496 b0 = _mm256_shuffle_epi32(b0, 0x6c);
2497 g0 = _mm256_shuffle_epi32(g0, 0xb1);
2498 r0 = _mm256_shuffle_epi32(r0, 0xc6);
2499
2500 a = v_uint32x8(b0);
2501 b = v_uint32x8(g0);
2502 c = v_uint32x8(r0);
2503}
2504
2505inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
2506{
2507 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2508 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2509 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2510
2511 __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
2512 __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
2513 __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
2514 __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
2515 __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
2516 __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
2517
2518 a = v_uint64x4(b0);
2519 b = v_uint64x4(g0);
2520 c = v_uint64x4(r0);
2521}
2522
2523inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
2524{
2525 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2526 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2527 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
2528 __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
2529 const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
2530 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
2531
2532 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2533 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2534 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2535 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2536
2537 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2538 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2539 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2540 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2541
2542 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2543 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2544 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2545 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2546
2547 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2548 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2549 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2550 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2551
2552 a = v_uint8x32(b0);
2553 b = v_uint8x32(g0);
2554 c = v_uint8x32(r0);
2555 d = v_uint8x32(a0);
2556}
2557
2558inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
2559{
2560 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
2561 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2562 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
2563 __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
2564 const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
2565 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
2566 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2567 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2568 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2569 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2570
2571 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2572 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2573 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2574 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2575
2576 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2577 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2578 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2579 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2580
2581 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2582 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2583 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2584 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2585
2586 a = v_uint16x16(b0);
2587 b = v_uint16x16(g0);
2588 c = v_uint16x16(r0);
2589 d = v_uint16x16(a0);
2590}
2591
2592inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
2593{
2594 __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
2595 __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2596 __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
2597 __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
2598
2599 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2600 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2601 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2602 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2603
2604 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2605 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2606 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2607 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2608
2609 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2610 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2611 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2612 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2613
2614 a = v_uint32x8(b0);
2615 b = v_uint32x8(g0);
2616 c = v_uint32x8(r0);
2617 d = v_uint32x8(a0);
2618}
2619
2620inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
2621{
2622 __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
2623 __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
2624 __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
2625 __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
2626
2627 __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
2628 __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
2629 __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
2630 __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
2631
2632 __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
2633 __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
2634 __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
2635 __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
2636
2637 a = v_uint64x4(b0);
2638 b = v_uint64x4(g0);
2639 c = v_uint64x4(r0);
2640 d = v_uint64x4(a0);
2641}
2642
2644
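// Interleaved stores: the channels are interleaved with unpack/shuffle/blend operations, the
// 128-bit lanes are reordered into memory order with permute2x128, and the result is written
// with a streaming, aligned, or unaligned store depending on the requested hal::StoreMode.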
2645inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
2646                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2647{
2648 __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
2649 __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
2650
2651 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2652 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2653
2654 if( mode == hal::STORE_ALIGNED_NOCACHE )
2655 {
2656 _mm256_stream_si256((__m256i*)ptr, xy0);
2657 _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
2658 }
2659 else if( mode == hal::STORE_ALIGNED )
2660 {
2661 _mm256_store_si256((__m256i*)ptr, xy0);
2662 _mm256_store_si256((__m256i*)(ptr + 32), xy1);
2663 }
2664 else
2665 {
2666 _mm256_storeu_si256((__m256i*)ptr, xy0);
2667 _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
2668 }
2669}
2670
2671inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
2672                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2673{
2674 __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
2675 __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
2676
2677 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2678 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2679
2680 if( mode == hal::STORE_ALIGNED_NOCACHE )
2681 {
2682 _mm256_stream_si256((__m256i*)ptr, xy0);
2683 _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
2684 }
2685 else if( mode == hal::STORE_ALIGNED )
2686 {
2687 _mm256_store_si256((__m256i*)ptr, xy0);
2688 _mm256_store_si256((__m256i*)(ptr + 16), xy1);
2689 }
2690 else
2691 {
2692 _mm256_storeu_si256((__m256i*)ptr, xy0);
2693 _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
2694 }
2695}
2696
2697inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
2698                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2699{
2700 __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
2701 __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
2702
2703 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2704 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2705
2706 if( mode == hal::STORE_ALIGNED_NOCACHE )
2707 {
2708 _mm256_stream_si256((__m256i*)ptr, xy0);
2709 _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
2710 }
2711 else if( mode == hal::STORE_ALIGNED )
2712 {
2713 _mm256_store_si256((__m256i*)ptr, xy0);
2714 _mm256_store_si256((__m256i*)(ptr + 8), xy1);
2715 }
2716 else
2717 {
2718 _mm256_storeu_si256((__m256i*)ptr, xy0);
2719 _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
2720 }
2721}
2722
2723inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
2724                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2725{
2726 __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
2727 __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
2728
2729 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2730 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2731
2732 if( mode == hal::STORE_ALIGNED_NOCACHE )
2733 {
2734 _mm256_stream_si256((__m256i*)ptr, xy0);
2735 _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
2736 }
2737 else if( mode == hal::STORE_ALIGNED )
2738 {
2739 _mm256_store_si256((__m256i*)ptr, xy0);
2740 _mm256_store_si256((__m256i*)(ptr + 4), xy1);
2741 }
2742 else
2743 {
2744 _mm256_storeu_si256((__m256i*)ptr, xy0);
2745 _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
2746 }
2747}
2748
2749inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
2750                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2751{
2752 const __m256i sh_b = _mm256_setr_epi8(
2753 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
2754 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2755 const __m256i sh_g = _mm256_setr_epi8(
2756 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
2757 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2758 const __m256i sh_r = _mm256_setr_epi8(
2759 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
2760 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2761
2762 __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2763 __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2764 __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2765
2766 const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2767 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2768 const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2769 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2770
2771 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2772 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2773 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2774
2775 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2776 __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
2777 __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
2778
2779 if( mode == hal::STORE_ALIGNED_NOCACHE )
2780 {
2781 _mm256_stream_si256((__m256i*)ptr, bgr0);
2782 _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
2783 _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
2784 }
2785 else if( mode == hal::STORE_ALIGNED )
2786 {
2787 _mm256_store_si256((__m256i*)ptr, bgr0);
2788 _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
2789 _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
2790 }
2791 else
2792 {
2793 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2794 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
2795 _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
2796 }
2797}
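// In the 3-channel byte store above, sh_b/sh_g/sh_r rotate each input plane so that the
// two _mm256_blendv_epi8 passes (masks m0/m1 select every third byte) leave p0/p1/p2
// holding the packed B,G,R,B,G,R,... stream split across 128-bit lanes; the final
// _mm256_permute2x128_si256 calls restore linear memory order before the 96-byte store.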
2798
2799inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
2800 hal::StoreMode mode=hal::STORE_UNALIGNED )
2801{
2802 const __m256i sh_b = _mm256_setr_epi8(
2803 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2804 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2805 const __m256i sh_g = _mm256_setr_epi8(
2806 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2807 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2808 const __m256i sh_r = _mm256_setr_epi8(
2809 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2810 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2811
2812 __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2813 __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2814 __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2815
2816 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2817 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2818 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2819 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2820
2821 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2822 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2823 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2824
2825 __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
2826 //__m256i bgr1 = p1;
2827 __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
2828
2829 if( mode == hal::STORE_ALIGNED_NOCACHE )
2830 {
2831 _mm256_stream_si256((__m256i*)ptr, bgr0);
2832 _mm256_stream_si256((__m256i*)(ptr + 16), p1);
2833 _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
2834 }
2835 else if( mode == hal::STORE_ALIGNED )
2836 {
2837 _mm256_store_si256((__m256i*)ptr, bgr0);
2838 _mm256_store_si256((__m256i*)(ptr + 16), p1);
2839 _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
2840 }
2841 else
2842 {
2843 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2844 _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
2845 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
2846 }
2847}
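// For the 16-bit 3-channel case the middle 256-bit output block is already correct as p1
// (hence the commented-out "bgr1" line above), so only the first and last blocks need a
// cross-lane permute before storing.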
2848
2849inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
2850 hal::StoreMode mode=hal::STORE_UNALIGNED )
2851{
2852 __m256i b0 = _mm256_shuffle_epi32(a.val, 0x6c);
2853 __m256i g0 = _mm256_shuffle_epi32(b.val, 0xb1);
2854 __m256i r0 = _mm256_shuffle_epi32(c.val, 0xc6);
2855
2856 __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
2857 __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
2858 __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
2859
2860 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2861 //__m256i bgr1 = p2;
2862 __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2863
2864 if( mode == hal::STORE_ALIGNED_NOCACHE )
2865 {
2866 _mm256_stream_si256((__m256i*)ptr, bgr0);
2867 _mm256_stream_si256((__m256i*)(ptr + 8), p2);
2868 _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
2869 }
2870 else if( mode == hal::STORE_ALIGNED )
2871 {
2872 _mm256_store_si256((__m256i*)ptr, bgr0);
2873 _mm256_store_si256((__m256i*)(ptr + 8), p2);
2874 _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
2875 }
2876 else
2877 {
2878 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2879 _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
2880 _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
2881 }
2882}
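// The 32-bit 3-channel case avoids byte shuffles: _mm256_shuffle_epi32 rotates each plane
// within its lanes and _mm256_blend_epi32 with the period-3 masks 0x92 / 0x24 merges
// them; as above, the middle block (p2) is stored without an extra lane permute.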
2883
2884inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2885 hal::StoreMode mode=hal::STORE_UNALIGNED )
2886{
2887 __m256i s01 = _mm256_unpacklo_epi64(a.val, b.val);
2888 __m256i s12 = _mm256_unpackhi_epi64(b.val, c.val);
2889 __m256i s20 = _mm256_blend_epi32(c.val, a.val, 0xcc);
2890
2891 __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
2892 __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
2893 __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
2894
2895 if( mode == hal::STORE_ALIGNED_NOCACHE )
2896 {
2897 _mm256_stream_si256((__m256i*)ptr, bgr0);
2898 _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
2899 _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
2900 }
2901 else if( mode == hal::STORE_ALIGNED )
2902 {
2903 _mm256_store_si256((__m256i*)ptr, bgr0);
2904 _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
2905 _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
2906 }
2907 else
2908 {
2909 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2910 _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
2911 _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
2912 }
2913}
2914
2915inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
2916 const v_uint8x32& c, const v_uint8x32& d,
2917 hal::StoreMode mode=hal::STORE_UNALIGNED )
2918{
2919 __m256i bg0 = _mm256_unpacklo_epi8(a.val, b.val);
2920 __m256i bg1 = _mm256_unpackhi_epi8(a.val, b.val);
2921 __m256i ra0 = _mm256_unpacklo_epi8(c.val, d.val);
2922 __m256i ra1 = _mm256_unpackhi_epi8(c.val, d.val);
2923
2924 __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
2925 __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
2926 __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
2927 __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
2928
2929 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2930 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2931 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2932 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2933
2934 if( mode == hal::STORE_ALIGNED_NOCACHE )
2935 {
2936 _mm256_stream_si256((__m256i*)ptr, bgra0);
2937 _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
2938 _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
2939 _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
2940 }
2941 else if( mode == hal::STORE_ALIGNED )
2942 {
2943 _mm256_store_si256((__m256i*)ptr, bgra0);
2944 _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
2945 _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
2946 _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
2947 }
2948 else
2949 {
2950 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2951 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
2952 _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
2953 _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
2954 }
2955}
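// Four-channel stores need no shuffle masks: one unpack pass pairs (b,g) and (r,a), a
// second pass merges the pairs into b,g,r,a quads inside each 128-bit lane, and the
// permutes order the four output vectors for a contiguous 128-byte store. Rough usage
// sketch (illustrative only, buffer names are not from this header):
//
//   v_uint8x32 b, g, r, a;
//   v_load_deinterleave(src_bgra, b, g, r, a);   // split 128 interleaved bytes
//   v_store_interleave(dst_bgra, b, g, r, a);    // re-interleave them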
2956
2957inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
2958 const v_uint16x16& c, const v_uint16x16& d,
2959 hal::StoreMode mode=hal::STORE_UNALIGNED )
2960{
2961 __m256i bg0 = _mm256_unpacklo_epi16(a.val, b.val);
2962 __m256i bg1 = _mm256_unpackhi_epi16(a.val, b.val);
2963 __m256i ra0 = _mm256_unpacklo_epi16(c.val, d.val);
2964 __m256i ra1 = _mm256_unpackhi_epi16(c.val, d.val);
2965
2966 __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
2967 __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
2968 __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
2969 __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
2970
2971 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2972 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2973 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2974 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2975
2976 if( mode == hal::STORE_ALIGNED_NOCACHE )
2977 {
2978 _mm256_stream_si256((__m256i*)ptr, bgra0);
2979 _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
2980 _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
2981 _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
2982 }
2983 else if( mode == hal::STORE_ALIGNED )
2984 {
2985 _mm256_store_si256((__m256i*)ptr, bgra0);
2986 _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
2987 _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
2988 _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
2989 }
2990 else
2991 {
2992 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2993 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
2994 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
2995 _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
2996 }
2997}
2998
2999inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
3000 const v_uint32x8& c, const v_uint32x8& d,
3001 hal::StoreMode mode=hal::STORE_UNALIGNED )
3002{
3003 __m256i bg0 = _mm256_unpacklo_epi32(a.val, b.val);
3004 __m256i bg1 = _mm256_unpackhi_epi32(a.val, b.val);
3005 __m256i ra0 = _mm256_unpacklo_epi32(c.val, d.val);
3006 __m256i ra1 = _mm256_unpackhi_epi32(c.val, d.val);
3007
3008 __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
3009 __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
3010 __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
3011 __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
3012
3013 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
3014 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
3015 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
3016 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
3017
3018 if( mode == hal::STORE_ALIGNED_NOCACHE )
3019 {
3020 _mm256_stream_si256((__m256i*)ptr, bgra0);
3021 _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
3022 _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
3023 _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
3024 }
3025 else if( mode == hal::STORE_ALIGNED )
3026 {
3027 _mm256_store_si256((__m256i*)ptr, bgra0);
3028 _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
3029 _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
3030 _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
3031 }
3032 else
3033 {
3034 _mm256_storeu_si256((__m256i*)ptr, bgra0);
3035 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
3036 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
3037 _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
3038 }
3039}
3040
3041inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
3042 const v_uint64x4& c, const v_uint64x4& d,
3043 hal::StoreMode mode=hal::STORE_UNALIGNED )
3044{
3045 __m256i bg0 = _mm256_unpacklo_epi64(a.val, b.val);
3046 __m256i bg1 = _mm256_unpackhi_epi64(a.val, b.val);
3047 __m256i ra0 = _mm256_unpacklo_epi64(c.val, d.val);
3048 __m256i ra1 = _mm256_unpackhi_epi64(c.val, d.val);
3049
3050 __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
3051 __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
3052 __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
3053 __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
3054
3055 if( mode == hal::STORE_ALIGNED_NOCACHE )
3056 {
3057 _mm256_stream_si256((__m256i*)ptr, bgra0);
3058 _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
3059 _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
3060 _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
3061 }
3062 else if( mode == hal::STORE_ALIGNED )
3063 {
3064 _mm256_store_si256((__m256i*)ptr, bgra0);
3065 _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
3066 _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
3067 _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
3068 }
3069 else
3070 {
3071 _mm256_storeu_si256((__m256i*)ptr, bgra0);
3072 _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
3073 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
3074 _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
3075 }
3076}
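// With 64-bit lanes a single unpacklo/unpackhi pass already produces the (b,g) / (r,a)
// pairs, so only the cross-lane permutes remain before the four stores.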
3077
3078#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
3079inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
3080{ \
3081 _Tpvec1 a1, b1; \
3082 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
3083 a0 = v_reinterpret_as_##suffix0(a1); \
3084 b0 = v_reinterpret_as_##suffix0(b1); \
3085} \
3086inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
3087{ \
3088 _Tpvec1 a1, b1, c1; \
3089 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
3090 a0 = v_reinterpret_as_##suffix0(a1); \
3091 b0 = v_reinterpret_as_##suffix0(b1); \
3092 c0 = v_reinterpret_as_##suffix0(c1); \
3093} \
3094inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
3095{ \
3096 _Tpvec1 a1, b1, c1, d1; \
3097 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
3098 a0 = v_reinterpret_as_##suffix0(a1); \
3099 b0 = v_reinterpret_as_##suffix0(b1); \
3100 c0 = v_reinterpret_as_##suffix0(c1); \
3101 d0 = v_reinterpret_as_##suffix0(d1); \
3102} \
3103inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3104 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3105{ \
3106 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3107 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3108 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
3109} \
3110inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
3111 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3112{ \
3113 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3114 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3115 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3116 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
3117} \
3118inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3119 const _Tpvec0& c0, const _Tpvec0& d0, \
3120 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3121{ \
3122 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3123 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3124 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3125 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
3126 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
3127}
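// The macro above provides the signed and floating-point overloads by reinterpreting the
// vectors as their unsigned counterparts and forwarding to the implementations defined
// earlier. For the v_int8x32 instantiation the two-channel store roughly expands to:
//
//   inline void v_store_interleave(schar* ptr, const v_int8x32& a0, const v_int8x32& b0,
//                                  hal::StoreMode mode=hal::STORE_UNALIGNED)
//   {
//       v_uint8x32 a1 = v_reinterpret_as_u8(a0);
//       v_uint8x32 b1 = v_reinterpret_as_u8(b0);
//       v_store_interleave((uchar*)ptr, a1, b1, mode);
//   }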
3128
3129OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
3130OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
3131OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
3132OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
3133OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
3134OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
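// As a result v_load_deinterleave / v_store_interleave are available for every 256-bit
// lane type, including floats; e.g. (illustrative only):
//
//   float xs[8], ys[8], out[16];
//   v_float32x8 vx = v256_load(xs), vy = v256_load(ys);
//   v_store_interleave(out, vx, vy);   // out = x0,y0,x1,y1,...,x7,y7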
3135
3136//
3137// FP16
3138//
3139
3140inline v_float32x8 v256_load_expand(const hfloat* ptr)
3141{
3142#if CV_FP16
3143 return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3144#else
3145 float CV_DECL_ALIGNED(32) buf[8];
3146 for (int i = 0; i < 8; i++)
3147 buf[i] = (float)ptr[i];
3148 return v256_load_aligned(buf);
3149#endif
3150}
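// With CV_FP16 the F16C instruction behind _mm256_cvtph_ps expands eight half-precision
// values in one step; otherwise the scalar fallback converts element by element through
// an aligned float buffer.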
3151
3152inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
3153{
3154#if CV_FP16
3155 __m128i ah = _mm256_cvtps_ph(a.val, 0);
3156 _mm_storeu_si128((__m128i*)ptr, ah);
3157#else
3158 float CV_DECL_ALIGNED(32) buf[8];
3159 v_store_aligned(buf, a);
3160 for (int i = 0; i < 8; i++)
3161 ptr[i] = hfloat(buf[i]);
3162#endif
3163}
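// The reverse direction uses _mm256_cvtps_ph with rounding-control immediate 0 (round to
// nearest even); the fallback again stores to a float buffer and converts per element.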
3164
3165//
3166// end of FP16
3167//
3168
3169inline void v256_cleanup() { _mm256_zeroall(); }
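// v256_cleanup() issues _mm256_zeroall(), clearing all YMM registers; this is presumably
// meant to avoid AVX/SSE transition penalties when legacy-SSE code runs afterwards.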
3170
3171CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3172
3174
3175} // cv::
3176
3177#endif // OPENCV_HAL_INTRIN_AVX_HPP