#ifndef OPENCV_HAL_INTRIN_AVX_HPP
#define OPENCV_HAL_INTRIN_AVX_HPP

#define CV_SIMD256_64F 1
#define CV_SIMD256_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }

inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }

inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }

inline int _v_cvtsi256_si32(const __m256i& a)
{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }

inline __m256i _v256_shuffle_odd_64(const __m256i& v)
{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }

inline __m256d _v256_shuffle_odd_64(const __m256d& v)
{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
template<int imm>
inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
{ return _mm256_permute2x128_si256(a, b, imm); }

template<int imm>
inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
{ return _mm256_permute2f128_ps(a, b, imm); }

template<int imm>
inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
{ return _mm256_permute2f128_pd(a, b, imm); }

template<int imm, typename _Tpvec>
inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }

template<int imm>
inline __m256i _v256_permute4x64(const __m256i& a)
{ return _mm256_permute4x64_epi64(a, imm); }

template<int imm>
inline __m256d _v256_permute4x64(const __m256d& a)
{ return _mm256_permute4x64_pd(a, imm); }

template<int imm, typename _Tpvec>
inline _Tpvec v256_permute4x64(const _Tpvec& a)
{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
inline __m128i _v256_extract_high(const __m256i& v)
{ return _mm256_extracti128_si256(v, 1); }

inline __m128 _v256_extract_high(const __m256& v)
{ return _mm256_extractf128_ps(v, 1); }

inline __m128d _v256_extract_high(const __m256d& v)
{ return _mm256_extractf128_pd(v, 1); }

inline __m128i _v256_extract_low(const __m256i& v)
{ return _mm256_castsi256_si128(v); }

inline __m128 _v256_extract_low(const __m256& v)
{ return _mm256_castps256_ps128(v); }

inline __m128d _v256_extract_low(const __m256d& v)
{ return _mm256_castpd256_pd128(v); }
inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
{
    const __m256i m = _mm256_set1_epi32(65535);
    __m256i am = _mm256_min_epu32(a, m);
    __m256i bm = _mm256_min_epu32(b, m);
    return _mm256_packus_epi32(am, bm);
}
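// Note on the helper above: _mm256_packus_epi32 treats its inputs as signed and
// saturates them to [0, 65535], so a value at or above 2^31 would be seen as
// negative and pack to 0. Clamping both operands to 65535 with _mm256_min_epu32
// first turns the pack into a proper unsigned u32 -> u16 saturating conversion:
// anything larger than 65535 simply becomes 65535.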
template<int i>
inline int _v256_extract_epi8(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910))
    return _mm256_extract_epi8(a, i);
#else
    __m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
    return _mm_extract_epi8(b, i & 15);
#endif
}

template<int i>
inline int _v256_extract_epi16(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910))
    return _mm256_extract_epi16(a, i);
#else
    __m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
    return _mm_extract_epi16(b, i & 7);
#endif
}

template<int i>
inline int _v256_extract_epi32(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910))
    return _mm256_extract_epi32(a, i);
#else
    __m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
    return _mm_extract_epi32(b, i & 3);
#endif
}

template<int i>
inline int64 _v256_extract_epi64(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910))
    return _mm256_extract_epi64(a, i);
#else
    __m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
    return _mm_extract_epi64(b, i & 1);
#endif
}
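// When the 256-bit extract intrinsics are not usable (older MSVC), the fallback
// splits the work: the compile-time index selects the 128-bit half (i >> 4 for
// bytes, i >> 3 for words, i >> 2 for dwords, i >> 1 for qwords) and the SSE
// extract then picks the lane inside that half (i & 15, i & 7, i & 3, i & 1).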
struct v_uint8x32
{
    typedef uchar lane_type;
    enum { nlanes = 32 };
    __m256i val;

    explicit v_uint8x32(__m256i v) : val(v) {}
    v_uint8x32(uchar v0,  uchar v1,  uchar v2,  uchar v3,
               uchar v4,  uchar v5,  uchar v6,  uchar v7,
               uchar v8,  uchar v9,  uchar v10, uchar v11,
               uchar v12, uchar v13, uchar v14, uchar v15,
               uchar v16, uchar v17, uchar v18, uchar v19,
               uchar v20, uchar v21, uchar v22, uchar v23,
               uchar v24, uchar v25, uchar v26, uchar v27,
               uchar v28, uchar v29, uchar v30, uchar v31)
    {
        val = _mm256_setr_epi8((char)v0,  (char)v1,  (char)v2,  (char)v3,
            (char)v4,  (char)v5,  (char)v6,  (char)v7,  (char)v8,  (char)v9,
            (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
            (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
            (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
            (char)v28, (char)v29, (char)v30, (char)v31);
    }
    v_uint8x32() {}

    uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
};

struct v_int8x32
{
    typedef schar lane_type;
    enum { nlanes = 32 };
    __m256i val;

    explicit v_int8x32(__m256i v) : val(v) {}
    v_int8x32(schar v0,  schar v1,  schar v2,  schar v3,
              schar v4,  schar v5,  schar v6,  schar v7,
              schar v8,  schar v9,  schar v10, schar v11,
              schar v12, schar v13, schar v14, schar v15,
              schar v16, schar v17, schar v18, schar v19,
              schar v20, schar v21, schar v22, schar v23,
              schar v24, schar v25, schar v26, schar v27,
              schar v28, schar v29, schar v30, schar v31)
    {
        val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
            v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
            v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
    }
    v_int8x32() {}

    schar get0() const { return (schar)_v_cvtsi256_si32(val); }
};

struct v_uint16x16
{
    typedef ushort lane_type;
    enum { nlanes = 16 };
    __m256i val;

    explicit v_uint16x16(__m256i v) : val(v) {}
    v_uint16x16(ushort v0,  ushort v1,  ushort v2,  ushort v3,
                ushort v4,  ushort v5,  ushort v6,  ushort v7,
                ushort v8,  ushort v9,  ushort v10, ushort v11,
                ushort v12, ushort v13, ushort v14, ushort v15)
    {
        val = _mm256_setr_epi16((short)v0,  (short)v1,  (short)v2,  (short)v3,
            (short)v4,  (short)v5,  (short)v6,  (short)v7,  (short)v8,  (short)v9,
            (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
    }
    v_uint16x16() {}

    ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
};

struct v_int16x16
{
    typedef short lane_type;
    enum { nlanes = 16 };
    __m256i val;

    explicit v_int16x16(__m256i v) : val(v) {}
    v_int16x16(short v0,  short v1,  short v2,  short v3,
               short v4,  short v5,  short v6,  short v7,
               short v8,  short v9,  short v10, short v11,
               short v12, short v13, short v14, short v15)
    {
        val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
                                v8, v9, v10, v11, v12, v13, v14, v15);
    }
    v_int16x16() {}

    short get0() const { return (short)_v_cvtsi256_si32(val); }
};

struct v_uint32x8
{
    typedef unsigned lane_type;
    enum { nlanes = 8 };
    __m256i val;

    explicit v_uint32x8(__m256i v) : val(v) {}
    v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
               unsigned v4, unsigned v5, unsigned v6, unsigned v7)
    {
        val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
            (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
    }
    v_uint32x8() {}

    unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
};

struct v_int32x8
{
    typedef int lane_type;
    enum { nlanes = 8 };
    __m256i val;

    explicit v_int32x8(__m256i v) : val(v) {}
    v_int32x8(int v0, int v1, int v2, int v3,
              int v4, int v5, int v6, int v7)
    {
        val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
    }
    v_int32x8() {}

    int get0() const { return _v_cvtsi256_si32(val); }
};

struct v_float32x8
{
    typedef float lane_type;
    enum { nlanes = 8 };
    __m256 val;

    explicit v_float32x8(__m256 v) : val(v) {}
    v_float32x8(float v0, float v1, float v2, float v3,
                float v4, float v5, float v6, float v7)
    {
        val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
    }
    v_float32x8() {}

    float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
};

struct v_uint64x4
{
    typedef uint64 lane_type;
    enum { nlanes = 4 };
    __m256i val;

    explicit v_uint64x4(__m256i v) : val(v) {}
    v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
    { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
    v_uint64x4() {}

    uint64 get0() const
    {
    #if defined __x86_64__ || defined _M_X64
        return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
    #else
        int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
        int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    #endif
    }
};

struct v_int64x4
{
    typedef int64 lane_type;
    enum { nlanes = 4 };
    __m256i val;

    explicit v_int64x4(__m256i v) : val(v) {}
    v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
    { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
    v_int64x4() {}

    int64 get0() const
    {
    #if defined __x86_64__ || defined _M_X64
        return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
    #else
        int a = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
        int b = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    #endif
    }
};

struct v_float64x4
{
    typedef double lane_type;
    enum { nlanes = 4 };
    __m256d val;

    explicit v_float64x4(__m256d v) : val(v) {}
    v_float64x4(double v0, double v1, double v2, double v3)
    { val = _mm256_setr_pd(v0, v1, v2, v3); }
    v_float64x4() {}

    double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
};
#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp)                      \
    inline _Tpvec v256_load(const _Tp* ptr)                             \
    { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); }         \
    inline _Tpvec v256_load_aligned(const _Tp* ptr)                     \
    { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); }          \
    inline _Tpvec v256_load_low(const _Tp* ptr)                         \
    {                                                                   \
        __m128i v128 = _mm_loadu_si128((const __m128i*)ptr);            \
        return _Tpvec(_mm256_castsi128_si256(v128));                    \
    }                                                                   \
    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)    \
    {                                                                   \
        __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0);            \
        __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1);            \
        return _Tpvec(_v256_combine(vlo, vhi));                         \
    }                                                                   \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                      \
    { _mm256_storeu_si256((__m256i*)ptr, a.val); }                      \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)              \
    { _mm256_store_si256((__m256i*)ptr, a.val); }                       \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)      \
    { _mm256_stream_si256((__m256i*)ptr, a.val); }                      \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    {                                                                   \
        if( mode == hal::STORE_UNALIGNED )                              \
            _mm256_storeu_si256((__m256i*)ptr, a.val);                  \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )                   \
            _mm256_stream_si256((__m256i*)ptr, a.val);                  \
        else                                                            \
            _mm256_store_si256((__m256i*)ptr, a.val);                   \
    }                                                                   \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                  \
    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); }      \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                 \
    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32,  uchar)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32,   schar)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16,  short)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8,  unsigned)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8,   int)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4,  uint64)
OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4,   int64)
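// Each instantiation above stamps out the whole family of 256-bit load/store
// wrappers for one integer vector type. Illustration only (hypothetical buffer
// names), assuming AVX2 is enabled and the buffers hold at least 32 elements:
//
//   uchar src[32], dst[32];
//   v_uint8x32 v = v256_load(src);            // unaligned 256-bit load
//   v_store(dst, v, hal::STORE_UNALIGNED);    // store with an explicit mode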
#define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
    inline _Tpvec v256_load(const _Tp* ptr)                             \
    { return _Tpvec(_mm256_loadu_##suffix(ptr)); }                      \
    inline _Tpvec v256_load_aligned(const _Tp* ptr)                     \
    { return _Tpvec(_mm256_load_##suffix(ptr)); }                       \
    inline _Tpvec v256_load_low(const _Tp* ptr)                         \
    {                                                                   \
        return _Tpvec(_mm256_cast##suffix##128_##suffix##256            \
                     (_mm_loadu_##suffix(ptr)));                        \
    }                                                                   \
    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)    \
    {                                                                   \
        halfreg vlo = _mm_loadu_##suffix(ptr0);                         \
        halfreg vhi = _mm_loadu_##suffix(ptr1);                         \
        return _Tpvec(_v256_combine(vlo, vhi));                         \
    }                                                                   \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                      \
    { _mm256_storeu_##suffix(ptr, a.val); }                             \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)              \
    { _mm256_store_##suffix(ptr, a.val); }                              \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)      \
    { _mm256_stream_##suffix(ptr, a.val); }                             \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    {                                                                   \
        if( mode == hal::STORE_UNALIGNED )                              \
            _mm256_storeu_##suffix(ptr, a.val);                         \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )                   \
            _mm256_stream_##suffix(ptr, a.val);                         \
        else                                                            \
            _mm256_store_##suffix(ptr, a.val);                          \
    }                                                                   \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                  \
    { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); }             \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                 \
    { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float,  ps, __m128)
OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
#define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)   \
    { return _Tpvec(cast(a.val)); }

#define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)   \
    inline _Tpvec v256_setzero_##suffix()                                 \
    { return _Tpvec(_mm256_setzero_si256()); }                            \
    inline _Tpvec v256_setall_##suffix(_Tp v)                             \
    { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); }                 \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)

OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32,  uchar,    u8,  epi8,   char)
OPENCV_HAL_IMPL_AVX_INIT(v_int8x32,   schar,    s8,  epi8,   char)
OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort,   u16, epi16,  short)
OPENCV_HAL_IMPL_AVX_INIT(v_int16x16,  short,    s16, epi16,  short)
OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8,  unsigned, u32, epi32,  int)
OPENCV_HAL_IMPL_AVX_INIT(v_int32x8,   int,      s32, epi32,  int)
OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4,  uint64,   u64, epi64x, int64)
OPENCV_HAL_IMPL_AVX_INIT(v_int64x4,   int64,    s64, epi64x, int64)

#define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
    inline _Tpvec v256_setzero_##suffix()                                \
    { return _Tpvec(_mm256_setzero_##zsuffix()); }                       \
    inline _Tpvec v256_setall_##suffix(_Tp v)                            \
    { return _Tpvec(_mm256_set1_##zsuffix(v)); }                         \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, cast)

OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float,  f32, ps, _mm256_castsi256_ps)
OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
{ return a; }
inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
{ return v_float32x8(_mm256_castpd_ps(a.val)); }

inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
{ return a; }
inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
{ return v_float64x4(_mm256_castps_pd(a.val)); }
#define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix)                 \
    inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); }     \
    inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32,  epi8)
OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32,   epi8)
OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16,  epi16)
OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8,  epi32)
OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8,   epi32)
OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4,  epi64)
OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4,   epi64)
OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix)               \
    template<int m>                                             \
    inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }

OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16,  epi16)
OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8,  epi32)
OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8,   epi32)
OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
template<int m>
inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
{
    enum {M0 = m};
    enum {M1 = (M0 | (M0 << 2)) & 0x33};
    enum {M2 = (M1 | (M1 << 1)) & 0x55};
    enum {MM = M2 | (M2 << 1)};
    return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
}
template<int m>
inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
{ return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
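// AVX2 has no 64-bit-lane blend, so the 4-bit mask m for 64-bit lanes is widened
// into the equivalent 8-bit mask for _mm256_blend_epi32: each bit of m is
// duplicated into a pair of adjacent bits. For example m = 0b0101 expands to
// MM = 0b00110011, i.e. every selected qword lane selects both of its dwords.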
#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin)  \
    template<int m>                                  \
    inline _Tpvec v256_shuffle(const _Tpvec& a)      \
    { return _Tpvec(_mm256_##intrin(a.val, m)); }

OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8,  shuffle_epi32)
OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8,   shuffle_epi32)
OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)

template<typename _Tpvec>
inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
{
    ab0 = v256_unpacklo(a, b);
    ab1 = v256_unpackhi(a, b);
}
template<typename _Tpvec>
inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }

inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
{ return v256_blend<0xf0>(a, b); }

inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
{ return v256_blend<0xc>(a, b); }

template<typename _Tpvec>
inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
{ return v256_permute2x128<0x21>(a, b); }

template<typename _Tpvec>
inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }

template<typename _Tpvec>
inline _Tpvec v256_swap_halves(const _Tpvec& a)
{ return v256_permute2x128<1>(a, a); }

template<typename _Tpvec>
inline _Tpvec v256_reverse_64(const _Tpvec& a)
{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
#define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec)                            \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)  \
    { return v256_permute2x128<0x20>(a, b); }                      \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
    { return v256_permute2x128<0x31>(a, b); }                      \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,      \
                            _Tpvec& c, _Tpvec& d)                  \
    {                                                              \
        _Tpvec a1b0 = v256_alignr_128(a, b);                       \
        c = v256_combine_diagonal(a, a1b0);                        \
        d = v256_combine_diagonal(a1b0, b);                        \
    }                                                              \
    inline void v_zip(const _Tpvec& a, const _Tpvec& b,            \
                      _Tpvec& ab0, _Tpvec& ab1)                    \
    {                                                              \
        _Tpvec ab0ab2, ab1ab3;                                     \
        v256_zip(a, b, ab0ab2, ab1ab3);                            \
        v_recombine(ab0ab2, ab1ab3, ab0, ab1);                     \
    }

OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
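// _mm256_unpacklo/_mm256_unpackhi interleave within each 128-bit lane, so
// v256_zip alone produces the interleaved pairs in the order ab0ab2 / ab1ab3.
// v_recombine then exchanges the middle 128-bit halves (v256_alignr_128 plus
// v256_combine_diagonal) so that v_zip returns the elements interleaved in true
// sequential order across the whole 256-bit register.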
#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin)            \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(intrin(a.val, b.val)); }                          \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)    \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32,  _mm256_adds_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32,  _mm256_subs_epu8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32,   _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32,   _mm256_subs_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16,  _mm256_adds_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16,  _mm256_subs_epi16)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8,  _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8,  _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8,  _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8,   _mm256_add_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8,   _mm256_sub_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8,   _mm256_mullo_epi32)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4,  _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4,  _mm256_sub_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4,   _mm256_add_epi64)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4,   _mm256_sub_epi64)

OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
    v_uint16x16 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
    v_int16x16 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i pl = _mm256_mullo_epi16(a.val, b.val);
    __m256i ph = _mm256_mulhi_epu16(a.val, b.val);
    __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
    __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
    return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
    __m256i pl = _mm256_mullo_epi16(a.val, b.val);
    __m256i ph = _mm256_mulhi_epi16(a.val, b.val);
    __m256i p0 = _mm256_unpacklo_epi16(pl, ph);
    __m256i p1 = _mm256_unpackhi_epi16(pl, ph);
    return v_int16x16(_mm256_packs_epi32(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
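// The 16-bit saturating operator* above is built from the two halves of the full
// 32-bit product: _mm256_mullo_epi16 and _mm256_mulhi_epi16/epu16 give the low and
// high 16 bits per lane, unpacklo/unpackhi interleave them into 32-bit products,
// and the final packs_epi32 / _v256_packs_epu32 step saturates back to 16 bits.
// The 8-bit operator* reuses the same idea via v_mul_expand followed by v_pack.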
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)   \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32,  _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32,   _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16,  _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32,  _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32,   _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16,  _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16,  _mm256_mullo_epi16)
inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i ad = _mm256_srai_epi16(a.val, 8);
    __m256i bd = _mm256_srai_epi16(b.val, 8);
    __m256i p0 = _mm256_mullo_epi16(a.val, b.val);
    __m256i p1 = _mm256_slli_epi16(_mm256_mullo_epi16(ad, bd), 8);

    const __m256i b01 = _mm256_set1_epi32(0xFF00FF00);
    return v_uint8x32(_mm256_blendv_epi8(p0, p1, b01));
}

inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
                         v_uint16x16& c, v_uint16x16& d)
{
    v_uint16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
                         v_int16x16& c, v_int16x16& d)
{
    v_int16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
                         v_int32x8& c, v_int32x8& d)
{
    v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));

    v_int16x16 v0, v1;
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);
}

inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
                         v_uint32x8& c, v_uint32x8& d)
{
    v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));

    v_uint16x16 v0, v1;
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);
}

inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
                         v_uint64x4& c, v_uint64x4& d)
{
    __m256i v0 = _mm256_mul_epu32(a.val, b.val);
    __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
    v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
}
inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b)
{ return v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); }
inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b)
{ return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); }
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)  \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm)            \
    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm)            \
    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)            \
    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)            \
    { return _Tpsvec(srai(a.val, imm)); }                             \
    template<int imm>                                                 \
    inline _Tpuvec v_shl(const _Tpuvec& a)                            \
    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
    template<int imm>                                                 \
    inline _Tpsvec v_shl(const _Tpsvec& a)                            \
    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
    template<int imm>                                                 \
    inline _Tpuvec v_shr(const _Tpuvec& a)                            \
    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
    template<int imm>                                                 \
    inline _Tpsvec v_shr(const _Tpsvec& a)                            \
    { return _Tpsvec(srai(a.val, imm)); }

OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8,  v_int32x8,  epi32, _mm256_srai_epi32)
inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
{
    __m256i d = _mm256_set1_epi64x((int64)1 << 63);
    __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
    return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
}
OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx)
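// AVX2 has no 64-bit arithmetic right shift, so it is emulated by biasing the
// value with 2^63 (turning the sign bit into an ordinary data bit), doing a
// logical shift, and subtracting the shifted bias:
//   (x + 2^63) >> imm  -  (2^63 >> imm)  ==  arithmetic x >> imm,  0 <= imm < 64.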
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const)  \
    OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix)   \
    OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix)    \
    OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix)   \
    inline _Tpvec operator ~ (const _Tpvec& a)                   \
    { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }

OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32,  si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32,   si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16,  si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8,  si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8,   si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4,  si256, _mm256_set1_epi64x(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4,   si256, _mm256_set1_epi64x(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps,    _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd,    _mm256_castsi256_pd(_mm256_set1_epi32(-1)))

#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix)                                \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }

OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32,  epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32,   epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16,  epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8,  epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8,   epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec)                     \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
    { return ~(a == b); }                                         \
    inline _Tpvec operator <  (const _Tpvec& a, const _Tpvec& b)  \
    { return b > a; }                                             \
    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)  \
    { return ~(a < b); }                                          \
    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)  \
    { return b >= a; }

#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit)  \
    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)     \
    { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); }            \
    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b)      \
    {                                                                   \
        __m256i smask = _mm256_set1_##suffix(sbit);                     \
        return _Tpuvec(_mm256_cmpgt_##suffix(                           \
                       _mm256_xor_si256(a.val, smask),                  \
                       _mm256_xor_si256(b.val, smask)));                \
    }                                                                   \
    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)     \
    { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); }            \
    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b)      \
    { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); }            \
    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec)                              \
    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)

OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32,  v_int8x32,  epi8,  (char)-128)
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8,  v_int32x8,  epi32, (int)0x80000000)
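// AVX2 only provides signed compare-greater instructions. XOR-ing both operands
// with the sign bit (0x80 / 0x8000 / 0x80000000) maps the unsigned range onto the
// signed range while preserving order, so the signed compare then produces the
// correct unsigned comparison result.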
#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec)                  \
    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); }          \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
    { return ~(a == b); }

OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)

#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix)     \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }

#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix)            \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, suffix)  \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, suffix)

OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)

inline v_float32x8 v_not_nan(const v_float32x8& a)
{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
inline v_float64x4 v_not_nan(const v_float64x4& a)
{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32,  _mm256_min_epu8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32,  _mm256_max_epu8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32,   _mm256_min_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32,   _mm256_max_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16,  _mm256_min_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16,  _mm256_max_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8,  _mm256_min_epu32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8,  _mm256_max_epu32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8,   _mm256_min_epi32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8,   _mm256_max_epi32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
{
    enum {IMM_R = (16 - imm) & 0xFF};
    enum {IMM_R2 = (32 - imm) & 0xFF};

    if (imm == 0)  return a;
    if (imm == 32) return b;
    if (imm > 32)  return v_uint8x32();

    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
    if (imm == 16) return v_uint8x32(swap);
    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
    return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2));
}

template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
{
    enum {IMM_L = (imm - 16) & 0xFF};

    if (imm == 0)  return a;
    if (imm == 32) return b;
    if (imm > 32)  return v_uint8x32();

    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
    if (imm == 16) return v_uint8x32(swap);
    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
    return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L));
}

template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
{
    enum {IMM_L = (imm - 16) & 0xFF};
    enum {IMM_R = (16 - imm) & 0xFF};

    if (imm == 0) return a;
    if (imm > 32) return v_uint8x32();

    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
    if (imm == 16) return v_uint8x32(swapz);
    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
    return v_uint8x32(_mm256_slli_si256(swapz, IMM_L));
}

template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
{
    enum {IMM_L = (imm - 16) & 0xFF};

    if (imm == 0) return a;
    if (imm > 32) return v_uint8x32();

    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
    if (imm == 16) return v_uint8x32(swapz);
    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
    return v_uint8x32(_mm256_srli_si256(swapz, IMM_L));
}
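// _mm256_alignr_epi8 concatenates only within each 128-bit lane, so byte rotates
// that cross a lane boundary first build a "swapped" register with
// _mm256_permute2x128_si256 and then align against it. Because imm is a template
// parameter, the imm == 0 / 16 / 32 branches are resolved at compile time.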
#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast)     \
    template<int imm>                                             \
    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)        \
    {                                                             \
        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};  \
        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a),    \
                                       v_reinterpret_as_u8(b));   \
        return _Tpvec(cast(ret.val));                             \
    }                                                             \
    template<int imm>                                             \
    inline _Tpvec intrin(const _Tpvec& a)                         \
    {                                                             \
        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};  \
        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a));   \
        return _Tpvec(cast(ret.val));                             \
    }

#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec)                                  \
    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  _Tpvec, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)

OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)

OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float32x8, _mm256_castsi256_ps)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float64x4, _mm256_castsi256_pd)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
inline v_uint8x32 v_reverse(const v_uint8x32 &a)
{
    static const __m256i perm = _mm256_setr_epi8(
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m256i vec = _mm256_shuffle_epi8(a.val, perm);
    return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
}

inline v_int8x32 v_reverse(const v_int8x32 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x16 v_reverse(const v_uint16x16 &a)
{
    static const __m256i perm = _mm256_setr_epi8(
            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
    __m256i vec = _mm256_shuffle_epi8(a.val, perm);
    return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
}

inline v_int16x16 v_reverse(const v_int16x16 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x8 v_reverse(const v_uint32x8 &a)
{
    static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}

inline v_int32x8 v_reverse(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x8 v_reverse(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x4 v_reverse(const v_uint64x4 &a)
{
    return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}

inline v_int64x4 v_reverse(const v_int64x4 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x4 v_reverse(const v_float64x4 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
inline unsigned v_reduce_sum(const v_uint8x32& a)
{
    __m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline int v_reduce_sum(const v_int8x32& a)
{
    __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
}
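// _mm256_sad_epu8 sums each group of 8 unsigned bytes into a 64-bit partial (four
// partials per 256-bit register), which are then folded down to a single value.
// For the signed variant every byte is first XOR-ed with 0x80 (equivalent to
// adding 128), so the bias of 32 * 128 = 4096 is subtracted from the final sum.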
#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin)                 \
    inline sctype v_reduce_##func(const _Tpvec& a)                                  \
    {                                                                               \
        __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val));  \
        val = intrin(val, _mm_srli_si128(val,8));                                   \
        val = intrin(val, _mm_srli_si128(val,4));                                   \
        val = intrin(val, _mm_srli_si128(val,2));                                   \
        val = intrin(val, _mm_srli_si128(val,1));                                   \
        return (sctype)_mm_cvtsi128_si32(val);                                      \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32,  schar, min, _mm_min_epi8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32,  schar, max, _mm_max_epi8)

#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a)                  \
    {                                                               \
        __m128i v0 = _v256_extract_low(a.val);                      \
        __m128i v1 = _v256_extract_high(a.val);                     \
        v0 = intrin(v0, v1);                                        \
        v0 = intrin(v0, _mm_srli_si128(v0, 8));                     \
        v0 = intrin(v0, _mm_srli_si128(v0, 4));                     \
        v0 = intrin(v0, _mm_srli_si128(v0, 2));                     \
        return (sctype)_mm_cvtsi128_si32(v0);                       \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  min, _mm_min_epi16)
OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  max, _mm_max_epi16)

#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a)                 \
    {                                                              \
        __m128i v0 = _v256_extract_low(a.val);                     \
        __m128i v1 = _v256_extract_high(a.val);                    \
        v0 = intrin(v0, v1);                                       \
        v0 = intrin(v0, _mm_srli_si128(v0, 8));                    \
        v0 = intrin(v0, _mm_srli_si128(v0, 4));                    \
        return (sctype)_mm_cvtsi128_si32(v0);                      \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      min, _mm_min_epi32)
OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      max, _mm_max_epi32)

#define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin)                  \
    inline float v_reduce_##func(const v_float32x8& a)                \
    {                                                                 \
        __m128 v0 = _v256_extract_low(a.val);                         \
        __m128 v1 = _v256_extract_high(a.val);                        \
        v0 = intrin(v0, v1);                                          \
        v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
        v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 1))); \
        return _mm_cvtss_f32(v0);                                     \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
inline int v_reduce_sum(const v_int32x8& a)
{
    __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
    s0 = _mm256_hadd_epi32(s0, s0);

    __m128i s1 = _v256_extract_high(s0);
    s1 = _mm_add_epi32(_v256_extract_low(s0), s1);

    return _mm_cvtsi128_si32(s1);
}

inline float v_reduce_sum(const v_float32x8& a)
{
    __m256 s0 = _mm256_hadd_ps(a.val, a.val);
    s0 = _mm256_hadd_ps(s0, s0);

    __m128 s1 = _v256_extract_high(s0);
    s1 = _mm_add_ps(_v256_extract_low(s0), s1);

    return _mm_cvtss_f32(s1);
}

inline uint64 v_reduce_sum(const v_uint64x4& a)
{
    uint64 CV_DECL_ALIGNED(32) idx[2];
    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
    return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x4& a)
{
    int64 CV_DECL_ALIGNED(32) idx[2];
    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
    return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x4& a)
{
    __m256d s0 = _mm256_hadd_pd(a.val, a.val);
    return _mm_cvtsd_f64(_mm_add_pd(_v256_extract_low(s0), _v256_extract_high(s0)));
}
inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
                                 const v_float32x8& c, const v_float32x8& d)
{
    __m256 ab = _mm256_hadd_ps(a.val, b.val);
    __m256 cd = _mm256_hadd_ps(c.val, d.val);
    return v_float32x8(_mm256_hadd_ps(ab, cd));
}
inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i half = _mm256_sad_epu8(a.val, b.val);
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
{
    __m256i half = _mm256_set1_epi8(0x7f);
    half = _mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half));
    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
    v_uint32x8 l, h;
    v_expand(v_add_wrap(a - b, b - a), l, h);
    return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
    v_uint32x8 l, h;
    v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
    return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
    return v_reduce_sum(v_max(a, b) - v_min(a, b));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 m = a < b;
    return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
    return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
}
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{
    __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                                             0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
    return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(                  a.val    , _popcnt_mask)),
                                      _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
}
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
}
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
}
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
    return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
}
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x8 v_popcount(const v_int32x8& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x4 v_popcount(const v_int64x4& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
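// Classic pshufb popcount: a 16-entry table holds the bit count of every nibble;
// each byte is split into its low and high nibble, both are looked up with
// _mm256_shuffle_epi8, and the two 4-bit counts are added per byte. The wider
// lane counts are then formed by summing neighbouring byte counts with rotates
// (or, for 64-bit lanes, with _mm256_sad_epu8 against zero).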
inline int v_signmask(const v_int8x32& a)
{ return _mm256_movemask_epi8(a.val); }
inline int v_signmask(const v_uint8x32& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }

inline int v_signmask(const v_int16x16& a)
{ return v_signmask(v_pack(a, a)) & 0xFFFF; }
inline int v_signmask(const v_uint16x16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_float32x8& a)
{ return _mm256_movemask_ps(a.val); }
inline int v_signmask(const v_float64x4& a)
{ return _mm256_movemask_pd(a.val); }

inline int v_signmask(const v_int32x8& a)
{ return v_signmask(v_reinterpret_as_f32(a)); }
inline int v_signmask(const v_uint32x8& a)
{ return v_signmask(v_reinterpret_as_f32(a)); }

inline int v_signmask(const v_int64x4& a)
{ return v_signmask(v_reinterpret_as_f64(a)); }
inline int v_signmask(const v_uint64x4& a)
{ return v_signmask(v_reinterpret_as_f64(a)); }
inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }

#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask)                                 \
    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; }  \
    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32,  -1)
OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32,   -1)
OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8,  255)
OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8,   255)
OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4,  15)
OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4,   15)
OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)

#define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec)                                                                           \
    inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; }  \
    inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
#if CV_FMA3
#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix)                            \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }            \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }
#else
#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix)                            \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
    { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); } \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); }
#endif

#define OPENCV_HAL_IMPL_AVX_MISC(_Tpvec, suffix)                    \
    inline _Tpvec v_sqrt(const _Tpvec& x)                           \
    { return _Tpvec(_mm256_sqrt_##suffix(x.val)); }                 \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_fma(a, a, b * b); }                                  \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)     \
    { return v_sqrt(v_fma(a, a, b*b)); }

OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
OPENCV_HAL_IMPL_AVX_MISC(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd)
inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
    return a * b + c;
}

inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
    return v_fma(a, b, c);
}

inline v_float32x8 v_invsqrt(const v_float32x8& x)
{
    v_float32x8 half = x * v256_setall_f32(0.5);
    v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
    t *= v256_setall_f32(1.5) - ((t * t) * half);
    return t;
}

inline v_float64x4 v_invsqrt(const v_float64x4& x)
{
    return v256_setall_f64(1.) / v_sqrt(x);
}
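// _mm256_rsqrt_ps only gives roughly 12 bits of accuracy; the line above applies
// one Newton-Raphson step, t' = t * (1.5 - 0.5 * x * t * t), which refines the
// estimate of 1/sqrt(x) to near single-precision accuracy. The double-precision
// version simply computes 1.0 / sqrt(x) since there is no rsqrt for pd.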
#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix)        \
    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)      \
    { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }

OPENCV_HAL_IMPL_AVX_ABS(int8x32,  epi8)
OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
OPENCV_HAL_IMPL_AVX_ABS(int32x8,  epi32)

inline v_float32x8 v_abs(const v_float32x8& x)
{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
inline v_float64x4 v_abs(const v_float64x4& x)
{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
{ return v_max(a, b) - v_min(a, b); }

inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
{
    v_int8x32 d = v_sub_wrap(a, b);
    v_int8x32 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}

inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }

inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 d = a - b;
    v_int32x8 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}

inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(a - b); }

inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(a - b); }

inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
    v_int8x32 d = a - b;
    v_int8x32 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_int32x8 v_round(const v_float32x8& a)
{ return v_int32x8(_mm256_cvtps_epi32(a.val)); }

inline v_int32x8 v_round(const v_float64x4& a)
{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }

inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
{
    __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val);
    return v_int32x8(_v256_combine(ai, bi));
}

inline v_int32x8 v_trunc(const v_float32x8& a)
{ return v_int32x8(_mm256_cvttps_epi32(a.val)); }

inline v_int32x8 v_trunc(const v_float64x4& a)
{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }

inline v_int32x8 v_floor(const v_float32x8& a)
{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }

inline v_int32x8 v_floor(const v_float64x4& a)
{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }

inline v_int32x8 v_ceil(const v_float32x8& a)
{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }

inline v_int32x8 v_ceil(const v_float64x4& a)
{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
inline v_float32x8 v_cvt_f32(const v_int32x8& a)
{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); }

inline v_float32x8 v_cvt_f32(const v_float64x4& a)
{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }

inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
{
    __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
    return v_float32x8(_v256_combine(af, bf));
}

inline v_float64x4 v_cvt_f64(const v_int32x8& a)
{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }

inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }

inline v_float64x4 v_cvt_f64(const v_float32x8& a)
{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }

inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }

inline v_float64x4 v_cvt_f64(const v_int64x4& v)
{
    __m256i magic_i_lo   = _mm256_set1_epi64x(0x4330000000000000);
    __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000);
    __m256i magic_i_all  = _mm256_set1_epi64x(0x4530000080100000);
    __m256d magic_d_all  = _mm256_castsi256_pd(magic_i_all);

    __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);

    __m256i v_hi = _mm256_srli_epi64(v.val, 32);

    v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);

    __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);

    __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
    return v_float64x4(result);
}
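// Packed int64 -> double has no AVX2 instruction, so the conversion above uses the
// well-known magic-constant technique (often attributed to Mysticial and wim on
// Stack Overflow): the low 32 bits are blended into the mantissa of a double built
// around 2^52 and the high 32 bits into one built around 2^84 + 2^63, then the
// magic constants are subtracted so the final addition reassembles the exact value.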
inline v_int8x32 v256_lut(const schar* tab, const int* idx)
{
    return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
                                      tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
                                      tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
                                      tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
}
inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
{
    return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
                                       *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
                                       *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
                                       *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
}
inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1));
}
inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }

inline v_int16x16 v256_lut(const short* tab, const int* idx)
{
    return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
                                        tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
}
inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2));
}
inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
#else
    return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
#endif
}
inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
inline v_int32x8 v256_lut(const int* tab, const int* idx)
{
    return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
}
inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
#else
    return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
#endif
}
inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
{
    return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
}
inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }

inline v_int64x4 v256_lut(const int64* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8));
#else
    return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8));
#endif
}
inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
{
    return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
}
inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
1659 inline v_float32x8 v256_lut(
const float* tab,
const int*
idx)
1661 return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((
const __m256i*)
idx), 4));
1663 inline v_float32x8 v256_lut_pairs(
const float* tab,
const int*
idx) {
return v_reinterpret_as_f32(v256_lut_pairs((
const int *)tab,
idx)); }
1664 inline v_float32x8 v256_lut_quads(
const float* tab,
const int*
idx) {
return v_reinterpret_as_f32(v256_lut_quads((
const int *)tab,
idx)); }
1666 inline v_float64x4 v256_lut(
const double* tab,
const int*
idx)
1668 return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((
const __m128i*)
idx), 8));
1670 inline v_float64x4 v256_lut_pairs(
const double* tab,
const int*
idx) {
return v_float64x4(_v256_combine(_mm_loadu_pd(tab +
idx[0]), _mm_loadu_pd(tab +
idx[1]))); }
inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
{
    return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
}

inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}

inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
{
    return v_float32x8(_mm256_i32gather_ps(tab, idxvec.val, 4));
}

inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
{
    return v_float64x4(_mm256_i32gather_pd(tab, _mm256_castsi256_si128(idxvec.val), 8));
}
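// Illustrative usage sketch (the helper name and scenario are ours, not part of the
// original header): a vectorized table lookup where the indices themselves live in a
// register, e.g. remapping 8 float samples through a lookup table of sufficient size.
inline v_float32x8 example_lut_by_vector(const float* tab, const v_int32x8& indices)
{
    return v_lut(tab, indices);  // one AVX2 gather (_mm256_i32gather_ps) per call
}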
inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
{
    int CV_DECL_ALIGNED(32) idx[8];
    v_store_aligned(idx, idxvec);

    __m128 z = _mm_setzero_ps();
    __m128 xy01, xy45, xy23, xy67;
    xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
    xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
    xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
    xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
    __m256 xy0145 = _v256_combine(xy01, xy45);
    xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
    xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
    xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
    xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
    __m256 xy2367 = _v256_combine(xy23, xy67);

    __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
    __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);

    x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
    y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
}
inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_low(idx, idxvec);

    __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
    __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
    __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
    __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
    __m256d xy02 = _v256_combine(xy0, xy2);
    __m256d xy13 = _v256_combine(xy1, xy3);

    x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
    y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
}
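// Illustrative usage sketch (names and scenario are ours): fetch 8 interleaved (x, y)
// float pairs stored in `points` at the offsets given by `idxvec`, keeping the
// coordinates in separate registers for further SIMD math.
inline v_float32x8 example_gather_point_norms(const float* points, const v_int32x8& idxvec)
{
    v_float32x8 x, y;
    v_lut_deinterleave(points, idxvec, x, y);  // x[i] = points[idx[i]], y[i] = points[idx[i] + 1]
    return v_fma(x, x, y * y);                 // squared norm per point
}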
inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
{ return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200))); }
inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
{ return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400))); }
inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
{ return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100))); }
inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
{ return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100))); }
inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
{ return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0))); }

inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
{ return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))),
                                               _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); }
inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
{ return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))),
                                                _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); }
inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
{ return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); }
inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
{ return v_float32x8(_mm256_permutevar8x32_ps(vec.val, _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); }
// 16 >> 32
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_dotprod(a, b) + c; }

// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
{
    __m256i even = _mm256_mul_epi32(a.val, b.val);
    __m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
    return v_int64x4(_mm256_add_epi64(even, odd));
}
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i even_m = _mm256_set1_epi32(0xFF00FF00);
    __m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m);
    __m256i odd_a  = _mm256_srli_epi16(a.val, 8);

    __m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m);
    __m256i odd_b  = _mm256_srli_epi16(b.val, 8);

    __m256i prod0  = _mm256_madd_epi16(even_a, even_b);
    __m256i prod1  = _mm256_madd_epi16(odd_a, odd_b);
    return v_uint32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
    __m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
    __m256i odd_a  = _mm256_srai_epi16(a.val, 8);

    __m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
    __m256i odd_b  = _mm256_srai_epi16(b.val, 8);

    __m256i prod0  = _mm256_madd_epi16(even_a, even_b);
    __m256i prod1  = _mm256_madd_epi16(odd_a, odd_b);
    return v_int32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
    __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
    __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
    __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);

    __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
    __m256i p13 = _mm256_srli_epi64(mul0, 32);
    __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
    __m256i p57 = _mm256_srli_epi64(mul1, 32);

    __m256i p15_ = _mm256_add_epi64(p02, p13);
    __m256i p9d_ = _mm256_add_epi64(p46, p57);

    return v_uint64x4(_mm256_add_epi64(
        _mm256_unpacklo_epi64(p15_, p9d_),
        _mm256_unpackhi_epi64(p15_, p9d_)
    ));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
    __m256i prod = _mm256_madd_epi16(a.val, b.val);
    __m256i sign = _mm256_srai_epi32(prod, 31);

    __m256i lo = _mm256_unpacklo_epi32(prod, sign);
    __m256i hi = _mm256_unpackhi_epi32(prod, sign);

    return v_int64x4(_mm256_add_epi64(
        _mm256_unpacklo_epi64(lo, hi),
        _mm256_unpackhi_epi64(lo, hi)
    ));
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_dotprod_expand(a, b) + c; }
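// Illustrative usage sketch (function name and buffer layout are assumptions): accumulate
// a widening dot product of two u8 buffers, 32 bytes per iteration, without overflowing
// the narrow element range. `n` must be a multiple of 32 here; a real implementation
// would add a scalar tail loop.
inline unsigned example_dot_u8(const uchar* a, const uchar* b, int n)
{
    v_uint32x8 acc = v256_setall_u32(0);
    for (int i = 0; i < n; i += 32)
        acc = v_dotprod_expand(v256_load(a + i), v256_load(b + i), acc);  // 4 bytes -> 1 u32 lane
    return v_reduce_sum(acc);
}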
// 16 >> 32
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
{ return v_dotprod(a, b); }
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_dotprod(a, b, c); }

// 32 >> 64
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
{ return v_dotprod(a, b); }
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b, c); }
// 8 >> 32
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_dotprod_expand(a, b, c); }

// 16 >> 64
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i mullo = _mm256_mullo_epi16(a.val, b.val);
    __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
    __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
    __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);

    __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
    __m256i p13 = _mm256_srli_epi64(mul0, 32);
    __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
    __m256i p57 = _mm256_srli_epi64(mul1, 32);

    __m256i p15_ = _mm256_add_epi64(p02, p13);
    __m256i p9d_ = _mm256_add_epi64(p46, p57);

    return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
    __m256i prod = _mm256_madd_epi16(a.val, b.val);
    __m256i sign = _mm256_srai_epi32(prod, 31);
    __m256i lo = _mm256_unpacklo_epi32(prod, sign);
    __m256i hi = _mm256_unpackhi_epi32(prod, sign);
    return v_int64x4(_mm256_add_epi64(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }

// 32 >> 64f
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_dotprod_expand(a, b, c); }
#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
    v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))

inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
                            const v_float32x8& m1, const v_float32x8& m2,
                            const v_float32x8& m3)
{
    v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
    v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
    v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
    v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}

inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
                               const v_float32x8& m1, const v_float32x8& m2,
                               const v_float32x8& a)
{
    v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
    v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
    v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
}
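// Illustrative usage sketch (ours): transform two packed 4-vectors at once. A v_float32x8
// holds two xyzw points side by side, and m0..m3/t hold the matrix columns and translation
// duplicated in both 128-bit halves, so v_matmuladd applies the same affine transform to
// both points in one call.
inline v_float32x8 example_transform2(const v_float32x8& two_points,
                                      const v_float32x8& m0, const v_float32x8& m1,
                                      const v_float32x8& m2, const v_float32x8& t)
{
    return v_matmuladd(two_points, m0, m1, m2, t);
}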
#define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to)    \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
                               const _Tpvec& a2, const _Tpvec& a3,              \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
    {                                                                           \
        __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val));       \
        __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val));       \
        __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val));       \
        __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val));       \
        b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1));                        \
        b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1));                        \
        b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3));                        \
        b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3));                        \
    }

OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8,  epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8,   epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
#define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)    \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    {                                                               \
        b0.val = intrin(_v256_extract_low(a.val));                  \
        b1.val = intrin(_v256_extract_high(a.val));                 \
    }                                                               \
    inline _Tpwvec v_expand_low(const _Tpvec& a)                    \
    { return _Tpwvec(intrin(_v256_extract_low(a.val))); }           \
    inline _Tpwvec v_expand_high(const _Tpvec& a)                   \
    { return _Tpwvec(intrin(_v256_extract_high(a.val))); }          \
    inline _Tpwvec v256_load_expand(const _Tp* ptr)                 \
    {                                                               \
        __m128i a = _mm_loadu_si128((const __m128i*)ptr);           \
        return _Tpwvec(intrin(a));                                  \
    }

OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32,  v_uint16x16, uchar,    _mm256_cvtepu8_epi16)
OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32,   v_int16x16,  schar,    _mm256_cvtepi8_epi16)
OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8,  ushort,   _mm256_cvtepu16_epi32)
OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16,  v_int32x8,   short,    _mm256_cvtepi16_epi32)
OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8,  v_uint64x4,  unsigned, _mm256_cvtepu32_epi64)
OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8,   v_int64x4,   int,      _mm256_cvtepi32_epi64)

#define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin)   \
    inline _Tpvec v256_load_expand_q(const _Tp* ptr)        \
    {                                                       \
        __m128i a = _mm_loadl_epi64((const __m128i*)ptr);   \
        return _Tpvec(intrin(a));                           \
    }

OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8,  schar, _mm256_cvtepi8_epi32)
inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }

inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i t = _mm256_set1_epi16(255);
    __m256i a1 = _mm256_min_epu16(a.val, t);
    __m256i b1 = _mm256_min_epu16(b.val, t);
    return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1)));
}

inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
{
    return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
}

inline void v_pack_store(schar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
{
    const __m256i m = _mm256_set1_epi16(255);
    __m256i am = _mm256_min_epu16(a.val, m);
    am = _v256_shuffle_odd_64(_mm256_packus_epi16(am, am));
    v_store_low(ptr, v_uint8x32(am));
}

inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
{
    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
                    v_reinterpret_as_s16((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
{
    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}

template<int n> inline
v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
{
    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
{
    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
{
    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
{
    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
    v_pack_store(ptr, (a + delta) >> n);
}
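// Illustrative usage sketch (ours): a typical fixed-point narrowing step, e.g. after a
// filter whose 16-bit accumulators carry 7 fractional bits. v_rshr_pack<7> adds the
// rounding constant 1 << 6, shifts right by 7, and packs the two 16-lane inputs into one
// saturated 32-lane u8 result.
inline v_uint8x32 example_fixed_point_narrow(const v_uint16x16& acc_lo,
                                             const v_uint16x16& acc_hi)
{
    return v_rshr_pack<7>(acc_lo, acc_hi);
}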
inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }

inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }

inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }

inline void v_pack_store(short* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
{
    const __m256i m = _mm256_set1_epi32(65535);
    __m256i am = _mm256_min_epu32(a.val, m);
    am = _v256_shuffle_odd_64(_mm256_packus_epi32(am, am));
    v_store_low(ptr, v_uint16x16(am));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack_u(a, a)); }

template<int n> inline
v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
{
    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
                    v_reinterpret_as_s32((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}

template<int n> inline
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
    v_int32x8 delta = v256_setall_s32(1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
{
    __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
    __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
    __m256i ab = _mm256_unpacklo_epi64(a0, b0);
    return v_uint32x8(_v256_shuffle_odd_64(ab));
}

inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
{
    __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
    v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
}

inline void v_pack_store(int* ptr, const v_int64x4& b)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }

template<int n> inline
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
{
    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{
    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
{
    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{
    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
// pack boolean
inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i ab = _mm256_packs_epi16(a.val, b.val);
    return v_uint8x32(_v256_shuffle_odd_64(ab));
}

inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
                           const v_uint32x8& c, const v_uint32x8& d)
{
    __m256i ab = _mm256_packs_epi32(a.val, b.val);
    __m256i cd = _mm256_packs_epi32(c.val, d.val);

    __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd));
    return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0)));
}

inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
                           const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
                           const v_uint64x4& g, const v_uint64x4& h)
{
    __m256i ab = _mm256_packs_epi32(a.val, b.val);
    __m256i cd = _mm256_packs_epi32(c.val, d.val);
    __m256i ef = _mm256_packs_epi32(e.val, f.val);
    __m256i gh = _mm256_packs_epi32(g.val, h.val);

    __m256i abcd = _mm256_packs_epi32(ab, cd);
    __m256i efgh = _mm256_packs_epi32(ef, gh);
    __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh));

    __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
    return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
}
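// Illustrative usage sketch (ours): collapse four 32-lane comparison masks into one byte
// mask, which is the pattern v_pack_b is meant for. The threshold scenario and helper
// name are assumptions made only for this example.
inline v_uint8x32 example_threshold_mask(const v_uint32x8& a, const v_uint32x8& b,
                                         const v_uint32x8& c, const v_uint32x8& d,
                                         unsigned thresh)
{
    v_uint32x8 t = v256_setall_u32(thresh);
    return v_pack_b(a > t, b > t, c > t, d > t);  // 0x00/0xFF per source lane
}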
#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec)                    \
    template<int s>                                            \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)  \
    { return v_rotate_right<s>(a, b); }

OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
template<int i> inline uchar v_extract_n(v_uint8x32 a)
{ return (uchar)_v256_extract_epi8<i>(a.val); }
template<int i> inline schar v_extract_n(v_int8x32 a)
{ return (schar)v_extract_n<i>(v_reinterpret_as_u8(a)); }
template<int i> inline ushort v_extract_n(v_uint16x16 a)
{ return (ushort)_v256_extract_epi16<i>(a.val); }
template<int i> inline short v_extract_n(v_int16x16 a)
{ return (short)v_extract_n<i>(v_reinterpret_as_u16(a)); }
template<int i> inline uint v_extract_n(v_uint32x8 a)
{ return (uint)_v256_extract_epi32<i>(a.val); }
template<int i> inline int v_extract_n(v_int32x8 a)
{ return (int)v_extract_n<i>(v_reinterpret_as_u32(a)); }
template<int i> inline uint64 v_extract_n(v_uint64x4 a)
{ return (uint64)_v256_extract_epi64<i>(a.val); }
template<int i> inline int64 v_extract_n(v_int64x4 v)
{ return (int64)v_extract_n<i>(v_reinterpret_as_u64(v)); }
template<int i> inline float v_extract_n(v_float32x8 v)
{
    union { uint iv; float fv; } d;
    d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
    return d.fv;
}
template<int i> inline double v_extract_n(v_float64x4 v)
{
    union { uint64 iv; double dv; } d;
    d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
    return d.dv;
}

template<int i> inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
{
    static const __m256i perm = _mm256_set1_epi32((char)i);
    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}
template<int i> inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
template<int i> inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
{
    __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));

    const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
    __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
    __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
    __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
    __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
    __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
    a = v_uint8x32(a0);
    b = v_uint8x32(b0);
}
2374 __m256i ab0 = _mm256_loadu_si256((
const __m256i*)ptr);
2375 __m256i ab1 = _mm256_loadu_si256((
const __m256i*)(ptr + 16));
2377 const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2378 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
2379 __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
2380 __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
2381 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2382 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2383 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2384 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2385 a = v_uint16x16(a0);
2386 b = v_uint16x16(b0);
2391 __m256i ab0 = _mm256_loadu_si256((
const __m256i*)ptr);
2392 __m256i ab1 = _mm256_loadu_si256((
const __m256i*)(ptr + 8));
2394 enum { sh = 0+2*4+1*16+3*64 };
2395 __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
2396 __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
2397 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2398 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2399 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2400 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2407 __m256i ab0 = _mm256_loadu_si256((
const __m256i*)ptr);
2408 __m256i ab1 = _mm256_loadu_si256((
const __m256i*)(ptr + 4));
2410 __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
2411 __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
2412 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
2413 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
2420 __m256i bgr0 = _mm256_loadu_si256((
const __m256i*)ptr);
2421 __m256i bgr1 = _mm256_loadu_si256((
const __m256i*)(ptr + 32));
2422 __m256i bgr2 = _mm256_loadu_si256((
const __m256i*)(ptr + 64));
2424 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2425 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2427 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2428 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2429 const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2430 -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
2432 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2433 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2434 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2437 sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
2438 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
2439 sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
2440 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
2441 sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2442 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2443 b0 = _mm256_shuffle_epi8(b0, sh_b);
2444 g0 = _mm256_shuffle_epi8(g0, sh_g);
2445 r0 = _mm256_shuffle_epi8(r0, sh_r);
2454 __m256i bgr0 = _mm256_loadu_si256((
const __m256i*)ptr);
2455 __m256i bgr1 = _mm256_loadu_si256((
const __m256i*)(ptr + 16));
2456 __m256i bgr2 = _mm256_loadu_si256((
const __m256i*)(ptr + 32));
2458 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
2459 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
2461 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2462 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2463 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2464 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2465 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
2466 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
2467 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
2468 const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2469 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2470 const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2471 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2472 const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2473 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2474 b0 = _mm256_shuffle_epi8(b0, sh_b);
2475 g0 = _mm256_shuffle_epi8(g0, sh_g);
2476 r0 = _mm256_shuffle_epi8(r0, sh_r);
2478 a = v_uint16x16(b0);
2479 b = v_uint16x16(g0);
2480 c = v_uint16x16(r0);
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
{
    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));

    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

    __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
    __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
    __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);

    b0 = _mm256_shuffle_epi32(b0, 0x6c);
    g0 = _mm256_shuffle_epi32(g0, 0xb1);
    r0 = _mm256_shuffle_epi32(r0, 0xc6);

    a = v_uint32x8(b0);
    b = v_uint32x8(g0);
    c = v_uint32x8(r0);
}
2507 __m256i bgr0 = _mm256_loadu_si256((
const __m256i*)ptr);
2508 __m256i bgr1 = _mm256_loadu_si256((
const __m256i*)(ptr + 4));
2509 __m256i bgr2 = _mm256_loadu_si256((
const __m256i*)(ptr + 8));
2511 __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
2512 __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
2513 __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
2514 __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
2515 __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
2516 __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
2525 __m256i bgr0 = _mm256_loadu_si256((
const __m256i*)ptr);
2526 __m256i bgr1 = _mm256_loadu_si256((
const __m256i*)(ptr + 32));
2527 __m256i bgr2 = _mm256_loadu_si256((
const __m256i*)(ptr + 64));
2528 __m256i bgr3 = _mm256_loadu_si256((
const __m256i*)(ptr + 96));
2529 const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
2530 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
2532 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2533 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2534 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2535 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2537 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2538 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2539 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2540 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2542 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2543 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2544 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2545 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2547 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2548 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2549 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2550 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2560 __m256i bgr0 = _mm256_loadu_si256((
const __m256i*)ptr);
2561 __m256i bgr1 = _mm256_loadu_si256((
const __m256i*)(ptr + 16));
2562 __m256i bgr2 = _mm256_loadu_si256((
const __m256i*)(ptr + 32));
2563 __m256i bgr3 = _mm256_loadu_si256((
const __m256i*)(ptr + 48));
2564 const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
2565 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
2566 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
2567 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
2568 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
2569 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
2571 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2572 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2573 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2574 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2576 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2577 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2578 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2579 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2581 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2582 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2583 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2584 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2586 a = v_uint16x16(b0);
2587 b = v_uint16x16(g0);
2588 c = v_uint16x16(r0);
2589 d = v_uint16x16(a0);
2592 inline void v_load_deinterleave(
const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
2594 __m256i p0 = _mm256_loadu_si256((
const __m256i*)ptr);
2595 __m256i p1 = _mm256_loadu_si256((
const __m256i*)(ptr + 8));
2596 __m256i p2 = _mm256_loadu_si256((
const __m256i*)(ptr + 16));
2597 __m256i p3 = _mm256_loadu_si256((
const __m256i*)(ptr + 24));
2599 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
2600 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
2601 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
2602 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
2604 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
2605 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
2606 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
2607 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
2609 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
2610 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
2611 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
2612 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
2622 __m256i bgra0 = _mm256_loadu_si256((
const __m256i*)ptr);
2623 __m256i bgra1 = _mm256_loadu_si256((
const __m256i*)(ptr + 4));
2624 __m256i bgra2 = _mm256_loadu_si256((
const __m256i*)(ptr + 8));
2625 __m256i bgra3 = _mm256_loadu_si256((
const __m256i*)(ptr + 12));
2627 __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
2628 __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
2629 __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
2630 __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
2632 __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
2633 __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
2634 __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
2635 __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
    __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);

    __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
    __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm256_stream_si256((__m256i*)ptr, xy0);
        _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm256_store_si256((__m256i*)ptr, xy0);
        _mm256_store_si256((__m256i*)(ptr + 32), xy1);
    }
    else
    {
        _mm256_storeu_si256((__m256i*)ptr, xy0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
    }
}
2674 __m256i xy_l = _mm256_unpacklo_epi16(
x.val,
y.val);
2675 __m256i xy_h = _mm256_unpackhi_epi16(
x.val,
y.val);
2677 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2678 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2682 _mm256_stream_si256((__m256i*)ptr, xy0);
2683 _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
2687 _mm256_store_si256((__m256i*)ptr, xy0);
2688 _mm256_store_si256((__m256i*)(ptr + 16), xy1);
2692 _mm256_storeu_si256((__m256i*)ptr, xy0);
2693 _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
2700 __m256i xy_l = _mm256_unpacklo_epi32(
x.val,
y.val);
2701 __m256i xy_h = _mm256_unpackhi_epi32(
x.val,
y.val);
2703 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2704 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2708 _mm256_stream_si256((__m256i*)ptr, xy0);
2709 _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
2713 _mm256_store_si256((__m256i*)ptr, xy0);
2714 _mm256_store_si256((__m256i*)(ptr + 8), xy1);
2718 _mm256_storeu_si256((__m256i*)ptr, xy0);
2719 _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
2726 __m256i xy_l = _mm256_unpacklo_epi64(
x.val,
y.val);
2727 __m256i xy_h = _mm256_unpackhi_epi64(
x.val,
y.val);
2729 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2730 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2734 _mm256_stream_si256((__m256i*)ptr, xy0);
2735 _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
2739 _mm256_store_si256((__m256i*)ptr, xy0);
2740 _mm256_store_si256((__m256i*)(ptr + 4), xy1);
2744 _mm256_storeu_si256((__m256i*)ptr, xy0);
2745 _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
2752 const __m256i sh_b = _mm256_setr_epi8(
2753 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
2754 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2755 const __m256i sh_g = _mm256_setr_epi8(
2756 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
2757 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2758 const __m256i sh_r = _mm256_setr_epi8(
2759 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
2760 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2762 __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2763 __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2764 __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2766 const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2767 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2768 const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2769 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2771 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2772 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2773 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2775 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2776 __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
2777 __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
2781 _mm256_stream_si256((__m256i*)ptr, bgr0);
2782 _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
2783 _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
2787 _mm256_store_si256((__m256i*)ptr, bgr0);
2788 _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
2789 _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
2793 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2794 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
2795 _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
2802 const __m256i sh_b = _mm256_setr_epi8(
2803 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2804 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2805 const __m256i sh_g = _mm256_setr_epi8(
2806 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2807 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2808 const __m256i sh_r = _mm256_setr_epi8(
2809 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2810 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2812 __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
2813 __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
2814 __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
2816 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2817 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2818 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2819 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2821 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2822 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2823 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2825 __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
2827 __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
2831 _mm256_stream_si256((__m256i*)ptr, bgr0);
2832 _mm256_stream_si256((__m256i*)(ptr + 16), p1);
2833 _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
2837 _mm256_store_si256((__m256i*)ptr, bgr0);
2838 _mm256_store_si256((__m256i*)(ptr + 16), p1);
2839 _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
2843 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2844 _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
2845 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
2849 inline void v_store_interleave(
unsigned* ptr,
const v_uint32x8& a,
const v_uint32x8& b,
const v_uint32x8& c,
2852 __m256i b0 = _mm256_shuffle_epi32(a.val, 0x6c);
2853 __m256i g0 = _mm256_shuffle_epi32(b.val, 0xb1);
2854 __m256i r0 = _mm256_shuffle_epi32(c.val, 0xc6);
2856 __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
2857 __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
2858 __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
2860 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2862 __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2866 _mm256_stream_si256((__m256i*)ptr, bgr0);
2867 _mm256_stream_si256((__m256i*)(ptr + 8), p2);
2868 _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
2872 _mm256_store_si256((__m256i*)ptr, bgr0);
2873 _mm256_store_si256((__m256i*)(ptr + 8), p2);
2874 _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
2878 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2879 _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
2880 _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
2887 __m256i s01 = _mm256_unpacklo_epi64(a.val, b.val);
2888 __m256i s12 = _mm256_unpackhi_epi64(b.val, c.val);
2889 __m256i s20 = _mm256_blend_epi32(c.val, a.val, 0xcc);
2891 __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
2892 __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
2893 __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
2897 _mm256_stream_si256((__m256i*)ptr, bgr0);
2898 _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
2899 _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
2903 _mm256_store_si256((__m256i*)ptr, bgr0);
2904 _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
2905 _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
2909 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2910 _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
2911 _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
2916 const v_uint8x32& c,
const v_uint8x32& d,
2919 __m256i bg0 = _mm256_unpacklo_epi8(a.val, b.val);
2920 __m256i bg1 = _mm256_unpackhi_epi8(a.val, b.val);
2921 __m256i ra0 = _mm256_unpacklo_epi8(c.val, d.val);
2922 __m256i ra1 = _mm256_unpackhi_epi8(c.val, d.val);
2924 __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
2925 __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
2926 __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
2927 __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
2929 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2930 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2931 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2932 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2936 _mm256_stream_si256((__m256i*)ptr, bgra0);
2937 _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
2938 _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
2939 _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
2943 _mm256_store_si256((__m256i*)ptr, bgra0);
2944 _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
2945 _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
2946 _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
2950 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2951 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
2952 _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
2953 _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
2958 const v_uint16x16& c,
const v_uint16x16& d,
2961 __m256i bg0 = _mm256_unpacklo_epi16(a.val, b.val);
2962 __m256i bg1 = _mm256_unpackhi_epi16(a.val, b.val);
2963 __m256i ra0 = _mm256_unpacklo_epi16(c.val, d.val);
2964 __m256i ra1 = _mm256_unpackhi_epi16(c.val, d.val);
2966 __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
2967 __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
2968 __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
2969 __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
2971 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2972 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2973 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2974 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2978 _mm256_stream_si256((__m256i*)ptr, bgra0);
2979 _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
2980 _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
2981 _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
2985 _mm256_store_si256((__m256i*)ptr, bgra0);
2986 _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
2987 _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
2988 _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
2992 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2993 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
2994 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
2995 _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
2999 inline void v_store_interleave(
unsigned* ptr,
const v_uint32x8& a,
const v_uint32x8& b,
3000 const v_uint32x8& c,
const v_uint32x8& d,
3003 __m256i bg0 = _mm256_unpacklo_epi32(a.val, b.val);
3004 __m256i bg1 = _mm256_unpackhi_epi32(a.val, b.val);
3005 __m256i ra0 = _mm256_unpacklo_epi32(c.val, d.val);
3006 __m256i ra1 = _mm256_unpackhi_epi32(c.val, d.val);
3008 __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
3009 __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
3010 __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
3011 __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
3013 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
3014 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
3015 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
3016 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
3020 _mm256_stream_si256((__m256i*)ptr, bgra0);
3021 _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
3022 _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
3023 _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
3027 _mm256_store_si256((__m256i*)ptr, bgra0);
3028 _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
3029 _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
3030 _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
3034 _mm256_storeu_si256((__m256i*)ptr, bgra0);
3035 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
3036 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
3037 _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
3042 const v_uint64x4& c,
const v_uint64x4& d,
3045 __m256i bg0 = _mm256_unpacklo_epi64(a.val, b.val);
3046 __m256i bg1 = _mm256_unpackhi_epi64(a.val, b.val);
3047 __m256i ra0 = _mm256_unpacklo_epi64(c.val, d.val);
3048 __m256i ra1 = _mm256_unpackhi_epi64(c.val, d.val);
3050 __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
3051 __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
3052 __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
3053 __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
3057 _mm256_stream_si256((__m256i*)ptr, bgra0);
3058 _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
3059 _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
3060 _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
3064 _mm256_store_si256((__m256i*)ptr, bgra0);
3065 _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
3066 _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
3067 _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
3071 _mm256_storeu_si256((__m256i*)ptr, bgra0);
3072 _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
3073 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
3074 _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}

OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32,   schar,  s8,  v_uint8x32,  uchar,    u8)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16,  short,  s16, v_uint16x16, ushort,   u16)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8,   int,    s32, v_uint32x8,  unsigned, u32)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float,  f32, v_uint32x8,  unsigned, u32)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4,   int64,  s64, v_uint64x4,  uint64,   u64)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4,  uint64,   u64)
// FP16
inline v_float32x8 v256_load_expand(const hfloat* ptr)
{
#if CV_FP16
    return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
#else
    float CV_DECL_ALIGNED(32) buf[8];
    for (int i = 0; i < 8; i++)
        buf[i] = (float)ptr[i];
    return v256_load_aligned(buf);
#endif
}

inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
{
#if CV_FP16
    __m128i ah = _mm256_cvtps_ph(a.val, 0);
    _mm_storeu_si128((__m128i*)ptr, ah);
#else
    float CV_DECL_ALIGNED(32) buf[8];
    v_store_aligned(buf, a);
    for (int i = 0; i < 8; i++)
        ptr[i] = hfloat(buf[i]);
#endif
}
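// Illustrative usage sketch (ours): a float16 -> float32 -> float16 round trip that
// scales 8 half-precision values. Assumes `src` and `dst` each hold at least 8 hfloat
// elements; on builds without hardware FP16 conversion the functions above fall back to
// their scalar loops.
inline void example_scale_fp16(const hfloat* src, hfloat* dst, float scale)
{
    v_float32x8 v = v256_load_expand(src);          // 8 x fp16 -> 8 x fp32
    v_pack_store(dst, v * v256_setall_f32(scale));  // scale, then back to fp16
}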
inline void v256_cleanup() { _mm256_zeroall(); }

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END