#ifndef OPENCV_HAL_INTRIN_AVX512_HPP
#define OPENCV_HAL_INTRIN_AVX512_HPP
#if defined(_MSC_VER) && (_MSC_VER < 1920)
# pragma warning(disable:4146)  // unary minus operator applied to unsigned type, result still unsigned
# pragma warning(disable:4309)  // truncation of constant value
# pragma warning(disable:4310)  // cast truncates constant value
#endif
#define CVT_ROUND_MODES_IMPLEMENTED 0

#define CV_SIMD512_64F 1
#define CV_SIMD512_FP16 0
20 #define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0))
21 #define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
22 _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
23 ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0))
24 #define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
25 a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
26 _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
27 ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
28 ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
29 ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0))
30 #define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
31 a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
32 a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
33 a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
34 _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
35 ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
36 ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
37 ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
38 ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
39 ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
40 ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
41 ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0))
42 #define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
43 a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
44 a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
45 a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
46 _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
47 (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
48 (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
49 (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
50 (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
51 (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
52 (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
53 (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0))
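// Illustrative note (not part of the original header): the _v512_set_* macros above emulate
// per-lane "set" constructors by packing narrower lanes into the 64-bit arguments accepted by
// _mm512_set_epi64, with the highest-numbered lane listed first. A minimal sketch, assuming
// AVX-512F is available at compile time:
//   __m512i seq = _v512_set_epu64(7, 6, 5, 4, 3, 2, 1, 0);  // lane 0 holds 0, lane 7 holds 7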
#ifndef _mm512_cvtpd_pslo
#ifdef _mm512_zextsi256_si512
#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
#else
#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
#endif
#endif
inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi)
{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); }

inline __m512 _v512_combine(const __m256& lo, const __m256& hi)
{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); }

inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi)
{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); }

inline int _v_cvtsi512_si32(const __m512i& a)
{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); }

inline __m256i _v512_extract_high(const __m512i& v)
{ return _mm512_extracti32x8_epi32(v, 1); }

inline __m256 _v512_extract_high(const __m512& v)
{ return _mm512_extractf32x8_ps(v, 1); }

inline __m256d _v512_extract_high(const __m512d& v)
{ return _mm512_extractf64x4_pd(v, 1); }

inline __m256i _v512_extract_low(const __m512i& v)
{ return _mm512_castsi512_si256(v); }

inline __m256 _v512_extract_low(const __m512& v)
{ return _mm512_castps512_ps256(v); }

inline __m256d _v512_extract_low(const __m512d& v)
{ return _mm512_castpd512_pd256(v); }

inline __m512i _v512_insert(const __m512i& a, const __m256i& b)
{ return _mm512_inserti32x8(a, b, 0); }

inline __m512 _v512_insert(const __m512& a, const __m256& b)
{ return _mm512_insertf32x8(a, b, 0); }

inline __m512d _v512_insert(const __m512d& a, const __m256d& b)
{ return _mm512_insertf64x4(a, b, 0); }
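// Illustrative note (not part of the original header): these helpers split a 512-bit register
// into its 256-bit halves and glue halves back together. A minimal sketch, assuming a valid
// __m512i value `v`:
//   __m256i lo = _v512_extract_low(v), hi = _v512_extract_high(v);
//   __m512i v2 = _v512_combine(lo, hi);   // v2 holds the same lanes as v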
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
    typedef uchar lane_type;
    enum { nlanes = 64 };

    explicit v_uint8x64(__m512i v) : val(v) {}
        val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                             v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);

    static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }
    uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
    typedef schar lane_type;
    enum { nlanes = 64 };

    explicit v_int8x64(__m512i v) : val(v) {}
        val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                             v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);

    static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }
    schar get0() const { return (schar)_v_cvtsi512_si32(val); }
    enum { nlanes = 32 };

    explicit v_uint16x32(__m512i v) : val(v) {}
        val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                              v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);

    static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }
    ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
    typedef short lane_type;
    enum { nlanes = 32 };

    explicit v_int16x32(__m512i v) : val(v) {}
    v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
               short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15,
               short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
               short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)

    static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }
    short get0() const { return (short)_v_cvtsi512_si32(val); }
    typedef unsigned lane_type;
    enum { nlanes = 16 };

    explicit v_uint32x16(__m512i v) : val(v) {}
    v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
                unsigned v4, unsigned v5, unsigned v6, unsigned v7,
                unsigned v8, unsigned v9, unsigned v10, unsigned v11,
                unsigned v12, unsigned v13, unsigned v14, unsigned v15)
    {
        val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7,
                                (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15);
    }

    static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }
    unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
    typedef int lane_type;
    enum { nlanes = 16 };

    explicit v_int32x16(__m512i v) : val(v) {}
    v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
               int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
    {
        val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }

    static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }
    int get0() const { return _v_cvtsi512_si32(val); }
    typedef float lane_type;
    enum { nlanes = 16 };

    explicit v_float32x16(__m512 v) : val(v) {}
    v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7,
                 float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
    {
        val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }

    static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }
    float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
    explicit v_uint64x8(__m512i v) : val(v) {}

    static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }

    uint64 get0() const
    {
    #if defined __x86_64__ || defined _M_X64
        return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
    #else
        int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
        int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    #endif
    }
    typedef int64 lane_type;

    explicit v_int64x8(__m512i v) : val(v) {}
    v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
    { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); }

    static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }

    int64 get0() const
    {
    #if defined __x86_64__ || defined _M_X64
        return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
    #else
        int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
        int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    #endif
    }
    typedef double lane_type;

    explicit v_float64x8(__m512d v) : val(v) {}
    v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
    { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); }

    static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }
    double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
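// Illustrative note (not part of the original header): each v_* wrapper type above holds one
// __m512/__m512i/__m512d register plus lane metadata. A minimal sketch of scalar broadcast and
// first-lane extraction, assuming the full class definitions:
//   v_float32x16 v = v512_setall_f32(1.5f);
//   float first = v.get0();   // 1.5f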
#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \
    inline _Tpvec v512_load(const _Tp* ptr) \
    { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \
    inline _Tpvec v512_load_aligned(const _Tp* ptr) \
    { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \
    inline _Tpvec v512_load_low(const _Tp* ptr) \
    { \
        __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \
        return _Tpvec(_mm512_castsi256_si512(v256)); \
    } \
    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \
        __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \
        return _Tpvec(_v512_combine(vlo, vhi)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { _mm512_storeu_si512((__m512i*)ptr, a.val); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { _mm512_store_si512((__m512i*)ptr, a.val); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { _mm512_stream_si512((__m512i*)ptr, a.val); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            _mm512_storeu_si512((__m512i*)ptr, a.val); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
            _mm512_stream_si512((__m512i*)ptr, a.val); \
        else \
            _mm512_store_si512((__m512i*)ptr, a.val); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64,  uchar)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64,   schar)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32,  short)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16,  int)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8,  uint64)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8,   int64)
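// Illustrative note (not part of the original header): a minimal copy loop using the wrappers
// generated above, assuming `src` and `dst` point to at least `n` uchar elements:
//   for (int i = 0; i + v_uint8x64::nlanes <= n; i += v_uint8x64::nlanes)
//       v_store(dst + i, v512_load(src + i));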
413 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
414 inline _Tpvec v512_load(const _Tp* ptr) \
415 { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \
416 inline _Tpvec v512_load_aligned(const _Tp* ptr) \
417 { return _Tpvec(_mm512_load_##suffix(ptr)); } \
418 inline _Tpvec v512_load_low(const _Tp* ptr) \
420 return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \
421 (_mm256_loadu_##suffix(ptr))); \
423 inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
425 halfreg vlo = _mm256_loadu_##suffix(ptr0); \
426 halfreg vhi = _mm256_loadu_##suffix(ptr1); \
427 return _Tpvec(_v512_combine(vlo, vhi)); \
429 inline void v_store(_Tp* ptr, const _Tpvec& a) \
430 { _mm512_storeu_##suffix(ptr, a.val); } \
431 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
432 { _mm512_store_##suffix(ptr, a.val); } \
433 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
434 { _mm512_stream_##suffix(ptr, a.val); } \
435 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
437 if( mode == hal::STORE_UNALIGNED ) \
438 _mm512_storeu_##suffix(ptr, a.val); \
439 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
440 _mm512_stream_##suffix(ptr, a.val); \
442 _mm512_store_##suffix(ptr, a.val); \
444 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
445 { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \
446 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
447 { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }
OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float,  ps, __m256)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8,  double, pd, __m256d)
452 #define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \
453 inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
454 { return _Tpvec(cast(a.val)); }
456 #define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
457 inline _Tpvec v512_setzero_##suffix() \
458 { return _Tpvec(_mm512_setzero_si512()); } \
459 inline _Tpvec v512_setall_##suffix(_Tp v) \
460 { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
461 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \
462 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \
463 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \
464 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, OPENCV_HAL_NOP) \
465 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, OPENCV_HAL_NOP) \
466 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, OPENCV_HAL_NOP) \
467 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, OPENCV_HAL_NOP) \
468 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, OPENCV_HAL_NOP) \
469 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
470 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8, suffix, _mm512_castpd_si512)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64,  uchar,    u8,  epi8,  char)
OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64,   schar,    s8,  epi8,  char)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort,   u16, epi16, short)
OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32,  short,    s16, epi16, short)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int)
OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16,  int,      s32, epi32, int)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8,  uint64,   u64, epi64, int64)
OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8,   int64,    s64, epi64, int64)
481 #define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
482 inline _Tpvec v512_setzero_##suffix() \
483 { return _Tpvec(_mm512_setzero_##zsuffix()); } \
484 inline _Tpvec v512_setall_##suffix(_Tp v) \
485 { return _Tpvec(_mm512_set1_##zsuffix(v)); } \
486 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \
487 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \
488 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
489 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, cast) \
490 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \
491 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, cast) \
492 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, cast) \
493 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, cast)
OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float,  f32, ps, _mm512_castsi512_ps)
OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8,  double, f64, pd, _mm512_castsi512_pd)
inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a)
{ return a; }
inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a)
{ return v_float32x16(_mm512_castpd_ps(a.val)); }

inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a)
{ return a; }
inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_castps_pd(a.val)); }
inline v_float32x16 v512_load_expand(const hfloat* ptr)
{
    return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
}

inline void v_pack_store(hfloat* ptr, const v_float32x16& a)
{
    __m256i ah = _mm512_cvtps_ph(a.val, 0);
    _mm256_storeu_si256((__m256i*)ptr, ah);
}
inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
{
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24,
                                    87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16,
                                    79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73,  9, 72,  8,
                                    71,  7, 70,  6, 69,  5, 68,  4, 67,  3, 66,  2, 65,  1, 64,  0);
    ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
                                   119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
                                   111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
                                   103, 39, 102, 38, 101, 37, 100, 36,  99, 35,  98, 34,  97, 33,  96, 32);
    ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val));
#else
    __m512i low = _mm512_unpacklo_epi8(a.val, b.val);
    __m512i high = _mm512_unpackhi_epi8(a.val, b.val);
    ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0), high));
    ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high));
#endif
}
inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
{
    __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41,  9, 40,  8,
                                    39,  7, 38,  6, 37,  5, 36,  4, 35,  3, 34,  2, 33,  1, 32,  0);
    ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
                                    55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
    ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val));
}
inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
{
    __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val));
}
inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
{
    __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0);
    ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
    ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val));
}
inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1)
{
    v_int8x64 i0, i1;
    v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1);
    ab0 = v_reinterpret_as_u8(i0);
    ab1 = v_reinterpret_as_u8(i1);
}
inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1)
{
    v_int16x32 i0, i1;
    v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1);
    ab0 = v_reinterpret_as_u16(i0);
    ab1 = v_reinterpret_as_u16(i1);
}
inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1)
{
    v_int32x16 i0, i1;
    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
    ab0 = v_reinterpret_as_u32(i0);
    ab1 = v_reinterpret_as_u32(i1);
}
inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1)
{
    v_int64x8 i0, i1;
    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
    ab0 = v_reinterpret_as_u64(i0);
    ab1 = v_reinterpret_as_u64(i1);
}
inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1)
{
    v_int32x16 i0, i1;
    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
    ab0 = v_reinterpret_as_f32(i0);
    ab1 = v_reinterpret_as_f32(i1);
}
inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1)
{
    v_int64x8 i0, i1;
    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
    ab0 = v_reinterpret_as_f64(i0);
    ab1 = v_reinterpret_as_f64(i1);
}
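// Illustrative note (not part of the original header): v_zip interleaves two vectors lane by
// lane. A minimal sketch, assuming 32-bit integer inputs:
//   v_int32x16 lo, hi;
//   v_zip(a, b, lo, hi);   // lo = {a0,b0,a1,b1,...,a7,b7}, hi = {a8,b8,...,a15,b15}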
#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
                            _Tpvec& c, _Tpvec& d) \
    { \
        c.val = _v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val)); \
        d.val = _v512_insert(b.val, _v512_extract_high(a.val)); \
    }
621 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64, epi8)
622 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64, epi8)
623 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32, epi16)
624 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32, epi16)
625 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16, epi32)
626 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16, epi32)
627 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8, epi64)
628 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8, epi64)
629 OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
630 OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8, pd)
637 #define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \
638 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
639 { return _Tpvec(intrin(a.val, b.val)); }
641 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8)
642 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8)
643 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
644 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16)
645 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8)
646 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8)
647 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
648 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16)
649 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16)
650 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16)
inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i ad = _mm512_srai_epi16(a.val, 8);
    __m512i bd = _mm512_srai_epi16(b.val, 8);
    __m512i p0 = _mm512_mullo_epi16(a.val, b.val);
    __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8);
    return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1));
}
inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
665 #define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
666 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
667 { return _Tpvec(intrin(a.val, b.val)); } \
668 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
669 { a.val = intrin(a.val, b.val); return a; }
671 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
672 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
673 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
674 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
675 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
676 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
677 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
678 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
680 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
681 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
682 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
683 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
686 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8)
687 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8)
688 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8)
689 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8)
690 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
691 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
692 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16)
693 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16)
695 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
696 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
697 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
698 OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
699 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
700 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
701 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
702 OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
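// Illustrative note (not part of the original header): for 8/16-bit lanes the generated
// operator+/- saturate, while v_add_wrap/v_sub_wrap/v_mul_wrap keep modulo semantics.
// A minimal sketch:
//   v_uint8x64 s = v512_setall_u8(250) + v512_setall_u8(10);             // saturates to 255
//   v_uint8x64 w = v_add_wrap(v512_setall_u8(250), v512_setall_u8(10));  // wraps to 4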
inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i pl = _mm512_mullo_epi16(a.val, b.val);
    __m512i ph = _mm512_mulhi_epu16(a.val, b.val);
    __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
    __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
    const __m512i m = _mm512_set1_epi32(65535);
    return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
}
inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
{
    __m512i pl = _mm512_mullo_epi16(a.val, b.val);
    __m512i ph = _mm512_mulhi_epi16(a.val, b.val);
    __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
    __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
    return v_int16x32(_mm512_packs_epi32(p0, p1));
}
inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
{ a = a * b; return a; }
inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
{ a = a * b; return a; }
inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
{ a = a * b; return a; }
inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
{ a = a * b; return a; }

inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
                         v_uint16x32& c, v_uint16x32& d)
{
    v_uint16x32 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
                         v_int16x32& c, v_int16x32& d)
{
    v_int16x32 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
                         v_int32x16& c, v_int32x16& d)
{
    v_int16x32 v0, v1;
    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);
}

inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
                         v_uint32x16& c, v_uint32x16& d)
{
    v_uint16x32 v0, v1;
    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);
}

inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b,
                         v_uint64x8& c, v_uint64x8& d)
{
    v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)),
          v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
}

inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
                         v_int64x8& c, v_int64x8& d)
{
    v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)),
          v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
}
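// Illustrative note (not part of the original header): v_mul_expand returns the full-width
// products split across two output vectors. A minimal sketch for 32-bit unsigned inputs:
//   v_uint64x8 lo, hi;
//   v_mul_expand(x, y, lo, hi);   // lo and hi together hold the sixteen 64-bit products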
#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpuvec v_shl(const _Tpuvec& a) \
    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shl(const _Tpsvec& a) \
    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpuvec v_shr(const _Tpuvec& a) \
    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shr(const _Tpsvec& a) \
    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }

OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8,  v_int64x8,  epi64)
832 #define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
833 OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
834 OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
835 OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
836 inline _Tpvec operator ~ (const _Tpvec& a) \
837 { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
839 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1))
840 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64, si512, _mm512_set1_epi32(-1))
841 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32, si512, _mm512_set1_epi32(-1))
842 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32, si512, _mm512_set1_epi32(-1))
843 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16, si512, _mm512_set1_epi32(-1))
844 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16, si512, _mm512_set1_epi32(-1))
845 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8, si512, _mm512_set1_epi64(-1))
846 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8, si512, _mm512_set1_epi64(-1))
847 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps, _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
848 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8, pd, _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
851 #define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \
852 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
853 { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }
855 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64, epi8, si512)
856 OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64, epi8, si512)
857 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512)
858 OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32, epi16, si512)
859 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512)
860 OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16, epi32, si512)
861 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8, epi64, si512)
862 OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8, epi64, si512)
863 OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps, ps)
864 OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd)
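// Illustrative note (not part of the original header): v_select(mask, a, b) returns a's lane
// wherever the corresponding mask lane is non-zero, else b's lane. A minimal sketch:
//   v_int32x16 m = (x > v512_setall_s32(0));               // all-ones / all-zeros per lane
//   v_int32x16 r = v_select(m, x, v512_setall_s32(0));     // keep positives, zero the rest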
867 #define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
868 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
869 { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
871 #define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
872 OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
873 OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
874 OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
875 OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
876 OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
877 OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64,  epu8,  epi8,  (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64,   epi8,  epi8,  (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32,  epi16, epi16, (short)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16,  epi32, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8,  epu64, epi64, (int64)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8,   epi64, epi64, (int64)-1)
888 #define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
889 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
890 { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
892 #define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
893 OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
894 OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
895 OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
896 OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
897 OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
898 OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8,  pd, epi64, (int64)-1)
inline v_float32x16 v_not_nan(const v_float32x16& a)
{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); }
inline v_float64x8 v_not_nan(const v_float64x8& a)
{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); }
909 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64, _mm512_min_epu8)
910 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64, _mm512_max_epu8)
911 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64, _mm512_min_epi8)
912 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64, _mm512_max_epi8)
913 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32, _mm512_min_epu16)
914 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32, _mm512_max_epu16)
915 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32, _mm512_min_epi16)
916 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32, _mm512_max_epi16)
917 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16, _mm512_min_epu32)
918 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16, _mm512_max_epu32)
919 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16, _mm512_min_epi32)
920 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16, _mm512_max_epi32)
921 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8, _mm512_min_epu64)
922 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8, _mm512_max_epu64)
923 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8, _mm512_min_epi64)
924 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8, _mm512_max_epi64)
925 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
926 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
927 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8, _mm512_min_pd)
928 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8, _mm512_max_pd)
template<bool prec, int imm4, bool part, int imm32>
struct _v_rotate_right
{ static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); } };

template<int imm4, int imm32>
struct _v_rotate_right<true, imm4, false, imm32>
{
    static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
    {
        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32    ),  imm4   *8),
                                         _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8)));
    }
};

template<int imm4>
struct _v_rotate_right<true, imm4, false, 15>
{
    static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
    {
        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15),  imm4   *8),
                                         _mm512_slli_epi32(b.val, (4-imm4)*8)));
    }
};

template<int imm4, int imm32>
struct _v_rotate_right<true, imm4, true, imm32>
{
    static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
    {
        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16),  imm4   *8),
                                         _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
    }
};

template<int imm4>
struct _v_rotate_right<true, imm4, true, 31>
{
    static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
    { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }
};

template<int imm32>
struct _v_rotate_right<false, 0, false, imm32>
{
    static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
    { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }
};

template<>
struct _v_rotate_right<false, 0, false, 0>
{ static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; } };

template<int imm32>
struct _v_rotate_right<false, 0, true, imm32>
{
    static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
    { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }
};

template<>
struct _v_rotate_right<false, 0, true, 16>
{ static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; } };

template<>
struct _v_rotate_right<false, 0, true, 32>
{ static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); } };
template<int imm>
inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
{
    return imm >= 128 ? v_int8x64() :
#if CV_AVX_512VBMI
    v_int8x64(_mm512_permutex2var_epi8(a.val,
973 _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
974 0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
975 0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
976 0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
977 0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
978 0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
979 0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
980 0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
#else
    _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b);
#endif
}
template<int imm>
inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
{
    if (imm == 0) return a;
    if (imm == 64) return b;
    if (imm >= 128) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_permutex2var_epi8(b.val,
993 _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
994 0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
995 0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
996 0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
997 0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
998 0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
999 0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
1000 0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
#else
    return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b);
#endif
}
template<int imm>
inline v_int8x64 v_rotate_right(const v_int8x64& a)
{
    if (imm == 0) return a;
    if (imm >= 64) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm,
1012 _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
1013 0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
1014 0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
1015 0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
1016 0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
1017 0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
1018 0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
1019 0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
#else
    return v_rotate_right<imm>(a, v512_setzero_s8());
#endif
}
template<int imm>
inline v_int8x64 v_rotate_left(const v_int8x64& a)
{
    if (imm == 0) return a;
    if (imm >= 64) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
1031 _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
1032 0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
1033 0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
1034 0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
1035 0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
1036 0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
1037 0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
1038 0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
#else
    return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
#endif
}
1044 #define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \
1045 template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1046 { return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1047 template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1048 { return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1049 template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1050 { return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); } \
1051 template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1052 { return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }
#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \
    template<int imm> \
    inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
    { \
        enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
        enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
        if (imm == 0) return a; \
        if (imm == _Tpvec::nlanes) return b; \
        if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
        return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
    } \
    template<int imm> \
    inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
    { \
        enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
        enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
        if (imm == 0) return a; \
        if (imm == _Tpvec::nlanes) return b; \
        if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
        return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
    } \
    template<int imm> \
    inline _Tpvec v_rotate_left(const _Tpvec& a) \
    { \
        if (imm == 0) return a; \
        if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
        return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
    } \
    template<int imm> \
    inline _Tpvec v_rotate_right(const _Tpvec& a) \
    { \
        if (imm == 0) return a; \
        if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
        return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
    }
OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64,   u8)
OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32,  u16)
OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32,   s16)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16,  epi32)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16,   epi32)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8,   epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8,    epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8,  pd)
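// Illustrative note (not part of the original header): v_rotate_right<N>(a, b) shifts a's lanes
// toward lane 0 by N positions and fills the vacated top lanes from b. A minimal sketch:
//   v_int32x16 r = v_rotate_right<2>(a, b);   // r = {a2..a15, b0, b1}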
inline v_uint8x64 v_reverse(const v_uint8x64 &a)
{
#if CV_AVX_512VBMI
    static const __m512i perm = _mm512_set_epi32(
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
            0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
            0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
    return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
#else
    static const __m512i shuf = _mm512_set_epi32(
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
    return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
#endif
}
inline v_int8x64 v_reverse(const v_int8x64 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x32 v_reverse(const v_uint16x32 &a)
{
#if CV_AVX_512VBMI
    static const __m512i perm = _mm512_set_epi32(
            0x00000001, 0x00020003, 0x00040005, 0x00060007,
            0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
            0x00100011, 0x00120013, 0x00140015, 0x00160017,
            0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
    return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
#else
    static const __m512i shuf = _mm512_set_epi32(
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
    return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
#endif
}
inline v_int16x32 v_reverse(const v_int16x32 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x16 v_reverse(const v_uint32x16 &a)
{
    static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}

inline v_int32x16 v_reverse(const v_int32x16 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x16 v_reverse(const v_float32x16 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x8 v_reverse(const v_uint64x8 &a)
{
    static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
}

inline v_int64x8 v_reverse(const v_int64x8 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x8 v_reverse(const v_float64x8 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1176 #define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b
1177 #define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \
1178 inline sctype v_reduce_##func(const _Tpvec& a) \
1179 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1180 sctype CV_DECL_ALIGNED(64) idx[2]; \
1181 _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \
1182 return scop(idx[0], idx[1]); }
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  min, v_int64x8,  min_epi64, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  max, v_int64x8,  max_epi64, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  sum, v_int64x8,  add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1190 #define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \
1191 inline double v_reduce_##func(const v_float64x8& a) \
1192 { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1193 double CV_DECL_ALIGNED(64) idx[2]; \
1194 _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \
1195 return scop(idx[0], idx[1]); }
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1200 #define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \
1201 inline sctype v_reduce_##func(const _Tpvec& a) \
1202 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1203 __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1204 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1205 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1206 return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  min, v_int32x16,  min_epi32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  max, v_int32x16,  max_epi32)
1212 #define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \
1213 inline float v_reduce_##func(const v_float32x16& a) \
1214 { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1215 __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \
1216 quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \
1217 quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \
1218 return _mm_cvtss_f32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)
1224 __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
1225 __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1));
1226 quarter = _mm_hadd_ps(quarter, quarter);
1227 return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter));
1231 __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
1232 __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1233 quarter = _mm_hadd_epi32(quarter, quarter);
1234 return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
1239 #define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \
1240 inline sctype v_reduce_##func(const _Tpvec& a) \
1241 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1242 __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1243 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1244 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1245 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1246 return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  min, v_int16x32,  min_epi16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  max, v_int16x32,  max_epi16)
1257 #define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
1258 inline sctype v_reduce_##func(const _Tpvec& a) \
1259 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1260 __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1261 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1262 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1263 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1264 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \
1265 return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64,  min_epi8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64,  max_epi8)
1271 #define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \
1272 inline sctype v_reduce_sum(const _Tpvec& a) \
1273 { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \
1274 _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \
1275 a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \
1276 __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \
1277 __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \
1278 a4 = _mm_hadd_epi32(a4, a4); \
1279 return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); }
OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int,  v_int8x64,  epi8)
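// Illustrative note (not part of the original header): the v_reduce_* helpers fold one vector
// down to a scalar by repeatedly combining 256-, 128- and 64-bit halves. A minimal sketch:
//   uint s = v_reduce_sum(v512_setall_u8(1));   // 64 lanes of 1 -> 64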
inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,
                                  const v_float32x16& c, const v_float32x16& d)
{
    __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
    __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
    __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
    __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
    return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh)));
}
inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i val = _mm512_sad_epu8(a.val, b.val);
    __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
{
    __m512i val = _mm512_set1_epi8(-128);
    val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val));
    __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
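// Illustrative usage sketch (hypothetical helper, assumes v512_load() from earlier in this header):
// sum of absolute differences over one 64-byte block, e.g. as a block-matching cost.
inline unsigned example_v512_sad64(const uchar* p, const uchar* q)
{
    return v_reduce_sad(v512_load(p), v512_load(q));  // sum of |p[i] - q[i]|, i = 0..63
}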
inline v_uint8x64 v_popcount(const v_int8x64& a)
{
#if CV_AVX_512BITALG
    return v_uint8x64(_mm512_popcnt_epi8(a.val));
#elif CV_AVX_512VBMI
    __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
                                            4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
    __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
    return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));
#else
    __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);
    __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);

    return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(a.val, _popcnt_mask)),
                                      _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));
#endif
}
inline v_uint16x32 v_popcount(const v_int16x32& a)
{
#if CV_AVX_512BITALG
    return v_uint16x32(_mm512_popcnt_epi16(a.val));
#elif CV_AVX_512VPOPCNTDQ
    __m512i zero = _mm512_setzero_si512();
    return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)),
                                          _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
#else
    v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
#endif
}
inline v_uint32x16 v_popcount(const v_int32x16& a)
{
#if CV_AVX_512VPOPCNTDQ
    return v_uint32x16(_mm512_popcnt_epi32(a.val));
#else
    v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
#endif
}
inline v_uint64x8 v_popcount(const v_int64x8& a)
{
#if CV_AVX_512VPOPCNTDQ
    return v_uint64x8(_mm512_popcnt_epi64(a.val));
#else
    return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512()));
#endif
}
inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8(a)); }
inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); }
inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); }
inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); }
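// Illustrative usage sketch (hypothetical helper, assumes v512_load() from earlier in this header):
// Hamming distance of two 64-byte blocks via per-byte popcount of the XOR.
inline unsigned example_v512_hamming64(const uchar* p, const uchar* q)
{
    v_uint8x64 diff = v512_load(p) ^ v512_load(q);  // differing bits, per byte
    return v_reduce_sum(v_popcount(diff));          // total number of differing bits
}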
#if CV_FMA3
#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); }
#else
#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); } \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); }
#endif
#define OPENCV_HAL_IMPL_AVX512_MISC(_Tpvec, suffix) \
    inline _Tpvec v_sqrt(const _Tpvec& x) \
    { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_fma(a, a, b * b); } \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_sqrt(v_fma(a, a, b * b)); }
OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd)
inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return a * b + c; }
inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return v_fma(a, b, c); }
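// Illustrative usage sketch (hypothetical helper, assumes v512_load()/v_store() from earlier in
// this header): a 16-wide "axpy" step, dst = a*x + y, using the fused multiply-add above.
inline void example_v512_axpy16(float* dst, float a, const float* x, const float* y)
{
    v_float32x16 va = v512_setall_f32(a);
    v_store(dst, v_muladd(va, v512_load(x), v512_load(y)));
}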
inline v_float32x16 v_invsqrt(const v_float32x16& x)
{
#if CV_AVX_512ER
    return v_float32x16(_mm512_rsqrt28_ps(x.val));
#else
    v_float32x16 half = x * v512_setall_f32(0.5);
    v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
    t *= v512_setall_f32(1.5) - ((t * t) * half);  // one Newton-Raphson step on the 14-bit estimate
    return t;
#endif
}
inline v_float64x8 v_invsqrt(const v_float64x8& x)
{
#if CV_AVX_512ER
    return v_float64x8(_mm512_rsqrt28_pd(x.val));
#else
    return v512_setall_f64(1.) / v_sqrt(x);
#endif
}
#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \
    inline _Tpuvec v_abs(const _Tpvec& x) \
    { return _Tpuvec(_mm512_abs_##suffix(x.val)); }

OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8)
OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16)
OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32)
OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64)
inline v_float32x16 v_abs(const v_float32x16& x)
{
#ifdef _mm512_abs_pd
    return v_float32x16(_mm512_abs_ps(x.val));
#else
    return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),
        _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
                        0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
#endif
}
inline v_float64x8 v_abs(const v_float64x8& x)
{
#ifdef _mm512_abs_pd
    #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
        // workaround: these GCC versions declare _mm512_abs_pd with a __m512 argument
        return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
    #else
        return v_float64x8(_mm512_abs_pd(x.val));
    #endif
#else
    return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),
        _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
                        0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
#endif
}
inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
{
    v_int8x64 d = v_sub_wrap(a, b);
    v_int8x64 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 d = a - b;
    v_int32x16 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}
inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
{ return v_abs(a - b); }
inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
{ return v_abs(a - b); }
inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
{
    v_int8x64 d = a - b;
    v_int8x64 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
{ return v_max(a, b) - v_min(a, b); }
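// Illustrative usage sketch (hypothetical helper, assumes v512_load()/v_store() from earlier in
// this header): per-pixel absolute difference of two 64-byte rows.
inline void example_v512_absdiff_row(uchar* dst, const uchar* a, const uchar* b)
{
    v_store(dst, v_absdiff(v512_load(a), v512_load(b)));  // dst[i] = |a[i] - b[i]|
}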
inline v_int32x16 v_round(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(a.val)); }

inline v_int32x16 v_round(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); }

inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)
{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), _mm512_cvtpd_epi32(b.val))); }

inline v_int32x16 v_trunc(const v_float32x16& a)
{ return v_int32x16(_mm512_cvttps_epi32(a.val)); }

inline v_int32x16 v_trunc(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); }
#if CVT_ROUND_MODES_IMPLEMENTED
inline v_int32x16 v_floor(const v_float32x16& a)
{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); }

inline v_int32x16 v_floor(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); }

inline v_int32x16 v_ceil(const v_float32x16& a)
{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }

inline v_int32x16 v_ceil(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); }
#else
inline v_int32x16 v_floor(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); }

inline v_int32x16 v_floor(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); }

inline v_int32x16 v_ceil(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); }

inline v_int32x16 v_ceil(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); }
#endif
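// Illustrative usage sketch (hypothetical helper, assumes v512_load()/v_store() from earlier in
// this header): convert 16 floats to int32 with round-to-nearest.
inline void example_v512_round16(int* dst, const float* src)
{
    v_store(dst, v_round(v512_load(src)));
}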
inline v_float32x16 v_cvt_f32(const v_int32x16& a)
{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); }

inline v_float32x16 v_cvt_f32(const v_float64x8& a)
{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); }

inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b)
{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }
inline v_float64x8 v_cvt_f64(const v_int32x16& a)
{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }

inline v_float64x8 v_cvt_f64_high(const v_int32x16& a)
{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }

inline v_float64x8 v_cvt_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }

inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }
inline v_float64x8 v_cvt_f64(const v_int64x8& v)
{
#if CV_AVX_512DQ
    return v_float64x8(_mm512_cvtepi64_pd(v.val));
#else
    // int64 -> double via the magic-number trick (no AVX-512DQ available)
    __m512i magic_i_lo   = _mm512_set1_epi64(0x4330000000000000); // 2^52
    __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
    __m512i magic_i_all  = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m512d magic_d_all  = _mm512_castsi512_pd(magic_i_all);

    // blend the low 32 bits of each element with magic_i_lo
    __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
    // extract the high 32 bits and flip their sign bit
    __m512i v_hi = _mm512_srli_epi64(v.val, 32);
    v_hi = _mm512_xor_si512(v_hi, magic_i_hi32);
    // finish the conversion in double precision and recombine the halves
    __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
    __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
    return v_float64x8(result);
#endif
}
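// Illustrative usage sketch (hypothetical helper, assumes v512_load() from earlier in this header):
// widen 8 packed int64 values to double, using the DQ instruction or the fallback above.
inline v_float64x8 example_v512_int64_to_f64(const int64* p)
{
    return v_cvt_f64(v512_load(p));
}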
inline v_int8x64 v512_lut(const schar* tab, const int* idx)
{
    __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
    __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
    __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
    __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
    return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3));
}
inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)
{
    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
    return v_int8x64(_v512_combine(p0, p1));
}
inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
}
inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); }
inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); }
inline v_int16x32 v512_lut(const short* tab, const int* idx)
{
    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 2));
    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
    return v_int16x32(_v512_combine(p0, p1));
}
inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
}
inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
#else
    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
#endif
}
inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); }
inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); }
inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); }
inline v_int32x16 v512_lut(const int* tab, const int* idx)
{
    return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
}
inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
#else
    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
#endif
}
inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)
{
    return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                      _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
                      _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
                      _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
                      _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
}
inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); }
inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); }
inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); }
inline v_int64x8 v512_lut(const int64* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
#else
    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
#endif
}
inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)
{
    return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                     _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
                     _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
                     _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
                     _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
}
inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); }
inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); }
inline v_float32x16 v512_lut(const float* tab, const int* idx)
{
    return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
}
inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); }
inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); }
inline v_float64x8 v512_lut(const double* tab, const int* idx)
{
    return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
}
inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
                       _mm_loadu_pd(tab + idx[0])),
                       _mm_loadu_pd(tab + idx[1]), 1),
                       _mm_loadu_pd(tab + idx[2]), 2),
                       _mm_loadu_pd(tab + idx[3]), 3));
}
inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)
{
    return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
}
inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}
inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)
{
    return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
}
inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)
{
    return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
}
inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y)
{
    x.val = _mm512_i32gather_ps(idxvec.val, tab, 4);
    y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4);
}
inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y)
{
    x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8);
    y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8);
}
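// Illustrative usage sketch (hypothetical helper): gather 16 table entries through v512_lut.
// Every idx[i] must be a valid offset into tab.
inline v_float32x16 example_v512_gather16(const float* tab, const int* idx)
{
    return v512_lut(tab, idx);  // tab[idx[0]], ..., tab[idx[15]]
}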
inline v_int8x64 v_interleave_pairs(const v_int8x64& vec)
{ return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200))); }
inline v_int8x64 v_interleave_quads(const v_int8x64& vec)
{ return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400))); }
inline v_int16x32 v_interleave_pairs(const v_int16x32& vec)
{ return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100))); }
inline v_int16x32 v_interleave_quads(const v_int16x32& vec)
{ return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100))); }
inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
{ return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD)); }
inline v_int8x64 v_pack_triplets(const v_int8x64& vec)
{
    return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000),
                                              _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100))));
}
inline v_int16x32 v_pack_triplets(const v_int16x32& vec)
{
    return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015,
                                                               0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val));
}
inline v_int32x16 v_pack_triplets(const v_int32x16& vec)
{
    return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                               0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
}
inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
{
    return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
}
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_dotprod(a, b) + c; }
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
{
    __m512i even = _mm512_mul_epi32(a.val, b.val);
    __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
    return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_dotprod(a, b) + c; }
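// Illustrative usage sketch (hypothetical helper, assumes v512_load() from earlier in this header):
// dot product of 32 int16 pairs; the pairwise products accumulate in int32 lanes, so the final
// reduction assumes they stay within int32 range.
inline int example_v512_dot32(const short* a, const short* b)
{
    return v_reduce_sum(v_dotprod(v512_load(a), v512_load(b)));
}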
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
    __m512i odd_a = _mm512_srli_epi16(a.val, 8);

    __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
    __m512i odd_b = _mm512_srli_epi16(b.val, 8);

    __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
    __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
    return v_uint32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
{
    __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
    __m512i odd_a = _mm512_srai_epi16(a.val, 8);

    __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
    __m512i odd_b = _mm512_srai_epi16(b.val, 8);

    __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
    __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
    return v_int32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
    __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
    __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);

    __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
    __m512i p13 = _mm512_srli_epi64(mul0, 32);
    __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
    __m512i p57 = _mm512_srli_epi64(mul1, 32);

    __m512i p15_ = _mm512_add_epi64(p02, p13);
    __m512i p9d_ = _mm512_add_epi64(p46, p57);

    return v_uint64x8(_mm512_add_epi64(
        _mm512_unpacklo_epi64(p15_, p9d_),
        _mm512_unpackhi_epi64(p15_, p9d_)));
}
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
{
    __m512i prod = _mm512_madd_epi16(a.val, b.val);
    __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
    __m512i odd = _mm512_srai_epi64(prod, 32);
    return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
{ return v_dotprod(a, b); }
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_dotprod(a, b, c); }

inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
{ return v_dotprod(a, b); }
inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_dotprod(a, b, c); }
inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_dotprod_expand(a, b, c); }
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
    __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
    __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);

    __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
    __m512i p13 = _mm512_srli_epi64(mul0, 32);
    __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
    __m512i p57 = _mm512_srli_epi64(mul1, 32);

    __m512i p15_ = _mm512_add_epi64(p02, p13);
    __m512i p9d_ = _mm512_add_epi64(p46, p57);
    return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
}
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_dotprod_expand(a, b, c); }
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b, c); }
#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
    v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))

inline v_float32x16 v_matmul(const v_float32x16& v,
                             const v_float32x16& m0, const v_float32x16& m1,
                             const v_float32x16& m2, const v_float32x16& m3)
{
    v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
    v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
    v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
    v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}
inline v_float32x16 v_matmuladd(const v_float32x16& v,
                                const v_float32x16& m0, const v_float32x16& m1,
                                const v_float32x16& m2, const v_float32x16& a)
{
    v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
    v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
    v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
}
#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                               const _Tpvec& a2, const _Tpvec& a3, \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
    { \
        __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \
        __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \
        __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \
        __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \
        b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \
        b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \
        b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \
        b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \
    }

OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps)
#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    { \
        b0.val = intrin(_v512_extract_low(a.val)); \
        b1.val = intrin(_v512_extract_high(a.val)); \
    } \
    inline _Tpwvec v_expand_low(const _Tpvec& a) \
    { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \
    inline _Tpwvec v_expand_high(const _Tpvec& a) \
    { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \
    inline _Tpwvec v512_load_expand(const _Tp* ptr) \
    { \
        __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \
        return _Tpwvec(intrin(a)); \
    }

OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64)
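// Illustrative usage sketch (hypothetical helper, assumes v512_load()/v_store() from earlier in
// this header): zero-extend 64 bytes to 64 ushorts using v_expand.
inline void example_v512_widen_u8(ushort* dst, const uchar* src)
{
    v_uint16x32 lo, hi;
    v_expand(v512_load(src), lo, hi);
    v_store(dst, lo);
    v_store(dst + 32, hi);
}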
#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
    inline _Tpvec v512_load_expand_q(const _Tp* ptr) \
    { \
        __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
        return _Tpvec(intrin(a)); \
    }

OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32)
inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }

inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
{
    const __m512i t = _mm512_set1_epi16(255);
    return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t))));
}

inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
{
    return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
}

inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
{
    const __m512i m = _mm512_set1_epi16(255);
    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
}
inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
{
    // rounding shift: add 2^(n-1) before shifting right by n
    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
                    v_reinterpret_as_s16((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
{
    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}

template<int n> inline
v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    v_pack_store(ptr, (a + delta) >> n);
}
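// Illustrative usage sketch (hypothetical helper, assumes v512_load()/v_store() from earlier in
// this header): fixed-point downscale of 64 ushorts by 1/256 with rounding and saturation to u8.
inline void example_v512_scale_down(uchar* dst, const ushort* src)
{
    v_uint16x32 a = v512_load(src), b = v512_load(src + 32);
    v_store(dst, v_rshr_pack<8>(a, b));  // (x + 128) >> 8, packed with saturation
}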
inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }

inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
{
    const __m512i m = _mm512_set1_epi32(65535);
    return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
}

inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
inline void v_pack_store(short* ptr, const v_int32x16& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
{
    const __m512i m = _mm512_set1_epi32(65535);
    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }
template<int n> inline
v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
{
    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
                    v_reinterpret_as_s32((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
{
    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}

template<int n> inline
v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x16& a)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }

inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }

inline void v_pack_store(int* ptr, const v_int64x8& b)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
template<int n> inline
v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
{
    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
{
    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
{
    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x8& a)
{
    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}
inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }

inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
                           const v_uint32x16& c, const v_uint32x16& d)
{
    __m512i ab = _mm512_packs_epi32(a.val, b.val);
    __m512i cd = _mm512_packs_epi32(c.val, d.val);

    return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
}

inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
                           const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
                           const v_uint64x8& g, const v_uint64x8& h)
{
    __m512i ab = _mm512_packs_epi32(a.val, b.val);
    __m512i cd = _mm512_packs_epi32(c.val, d.val);
    __m512i ef = _mm512_packs_epi32(e.val, f.val);
    __m512i gh = _mm512_packs_epi32(g.val, h.val);

    __m512i abcd = _mm512_packs_epi32(ab, cd);
    __m512i efgh = _mm512_packs_epi32(ef, gh);

    return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
                                                               27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
}
#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
    template<int s> \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
    { return v_rotate_right<s>(a, b); }

OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }

OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
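// Illustrative usage sketch (hypothetical helper, assumes v512_load() from earlier in this header):
// read a single lane of a vector register with a compile-time index.
inline float example_v512_lane5(const float* p)
{
    return v_extract_n<5>(v512_load(p));  // lane 5 of the 16 loaded floats
}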
template<int i>
inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
{
    static const __m512i perm = _mm512_set1_epi32((char)i);
    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}

template<int i>
inline v_int32x16 v_broadcast_element(const v_int32x16& a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }

template<int i>
inline v_float32x16 v_broadcast_element(const v_float32x16& a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
                                    94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
                                    62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
                                    95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
                                    63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
    b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
#else
    __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
    __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
    __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
    __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
    b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
#endif
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
    b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1));
    b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1));
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1));
    b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
}
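// Illustrative usage sketch (hypothetical helper, assumes v_store() from earlier in this header):
// split an interleaved 2-channel ushort row into two planes, 32 elements per plane.
inline void example_v512_split_pairs(const ushort* interleaved, ushort* ch0, ushort* ch1)
{
    v_uint16x32 a, b;
    v_load_deinterleave(interleaved, a, b);  // a = even-indexed elements, b = odd-indexed
    v_store(ch0, a);
    v_store(ch1, b);
}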
inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));

#if CV_AVX_512VBMI2
    __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81,
                                    78, 75, 72, 69, 66, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33,
                                    30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 62, 59, 56, 53, 50,
                                    47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
    __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1);
    __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2);
    __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
                    _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
                                    77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97,
                                    94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49,
                                    46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2);
    a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
    b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
    c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
#elif CV_AVX_512VBMI
    __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
    __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
    __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
    a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
                                                                    77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48,
                                                                    46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24,
                                                                    23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2));
    b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40,
                                                                    39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17,
                                                                    15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114,
                                                                   111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0));
    c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18,
                                                                    15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98,
                                                                    95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50,
                                                                    47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1));
#else
    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);

    __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2);
    __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
                                                                   14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
    __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
    a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
    c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
    b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
#endif
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));

    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);

    a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
    b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
                                                                    14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
    c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));

    __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
    __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1);
    __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
    __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);

    a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
    b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
    c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));

    __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0);
    __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);

    a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
    c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
    b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
}
inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192));

#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
                                    94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
                                    62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
                                    95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
                                    63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);

    a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
    c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
    b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
    d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
#else
    __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
    __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
    __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask);
    __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask);
    __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask);

    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1);
    __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1);
    __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
    __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);

    a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
    c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
    b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
    d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
#endif
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96));

    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);

    a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
    c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
    b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
    d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48));

    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);

    a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
    c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
    b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
    d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24));

    __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);

    a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
    c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
    b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
    d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
}
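// Illustrative usage sketch (hypothetical helper, assumes v_store() from earlier in this header):
// split 64 interleaved BGRA pixels (256 bytes) into four 64-byte planes.
inline void example_v512_split_bgra(const uchar* bgra, uchar* b, uchar* g, uchar* r, uchar* a)
{
    v_uint8x64 vb, vg, vr, va;
    v_load_deinterleave(bgra, vb, vg, vr, va);
    v_store(b, vb); v_store(g, vg); v_store(r, vr); v_store(a, va);
}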
inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint8x64 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 64), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), high.val);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint16x32 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 32), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), high.val);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint32x16 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 16), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), high.val);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint64x8 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 8), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 8), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 8), high.val);
    }
}
inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122,
                                    79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74,
                                    10, 116, 73, 9, 115, 72, 8, 114, 71, 7, 113, 70, 6, 112, 69, 5,
                                   111, 68, 4, 110, 67, 3, 109, 66, 2, 108, 65, 1, 107, 64, 0, 106);
    __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16,
                                    37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32,
                                    95, 10, 31, 94, 9, 30, 93, 8, 29, 92, 7, 28, 91, 6, 27, 90,
                                     5, 26, 89, 4, 25, 88, 3, 24, 87, 2, 23, 86, 1, 22, 85, 0);
    __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101,
                                   122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117,
                                    53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48,
                                    90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85);
    __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
    __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
    __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);

    __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
    __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
    __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
#else
    __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);

    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
                                     5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
    __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1);
    __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
    __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);

    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
#endif

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
                                const v_uint16x32& c, hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
                                     5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
    __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
    __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
    __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);

    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
    }
}
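// 3-channel 32-bit store. The first output vector is assembled with mask_expand: the masks 0x9249,
// 0x2492 and 0x4924 each mark every third dword lane (offset by 0, 1 and 2), so the leading
// elements of a, b and c are scattered into interleaved positions; the other two outputs come from
// two-source dword permutes followed by the same every-third-lane blends.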
inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
                                const v_uint32x16& c, hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
    __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26);
    __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
    __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);

    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
    __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
    __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
                                const v_uint64x8& c, hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10);
    __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5);
    __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
    __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);

    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
    __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
    __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
    }
}
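// 4-channel (e.g. BGRA) stores. Two rounds of v_zip suffice: zipping (a, c) and (b, d) produces
// B/R and G/A pairs, and zipping those results again yields fully interleaved B,G,R,A quads in the
// four output registers, which are then written according to hal::StoreMode.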
inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
                                const v_uint8x64& c, const v_uint8x64& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint8x64 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint8x64 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val);
    }
}
inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
                                const v_uint16x32& c, const v_uint16x32& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint16x32 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint16x32 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val);
    }
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
                                const v_uint32x16& c, const v_uint32x16& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint32x16 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint32x16 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val);
    }
}
inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
                                const v_uint64x8& c, const v_uint64x8& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint64x8 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint64x8 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val);
    }
}
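// The signed and floating-point element types reuse the unsigned implementations above: the macro
// below reinterprets the lanes, forwards to the matching v_load_deinterleave / v_store_interleave
// overload, and reinterprets the results back.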
#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64)
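// Illustrative use of the interleave helpers above (not part of this header's API): deinterleave
// one row of packed BGR uchar pixels, swap the B and R planes, and store the row back. `src`,
// `dst` and `width` are hypothetical names, and `width` is assumed to be a multiple of
// v_uint8x64::nlanes.
//
//   void swapBR_row(const uchar* src, uchar* dst, int width)
//   {
//       for (int x = 0; x < width; x += v_uint8x64::nlanes)
//       {
//           v_uint8x64 b, g, r;
//           v_load_deinterleave(src + 3 * x, b, g, r);
//           v_store_interleave(dst + 3 * x, r, g, b);
//       }
//   }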
inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
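// The predicates below build on the same sign-bit masks as v_signmask above: v_check_any is true
// when at least one lane is negative, and v_check_all is implemented by verifying that no lane
// compares not-less-than zero (i.e. no lane is >= 0).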
inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
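// v_scan_forward returns the index of the first negative lane (0 when none is negative). The
// 8-bit version splits its 64-lane sign mask into two 32-bit halves so trailingZeros32 can be
// applied; the 32/64-bit versions reuse the 16-bit sign mask: the sign bit of a negative wide lane
// falls in its top 16-bit half, so dividing the first set bit position by 2 (or 4) recovers the
// lane index.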
inline int v_scan_forward(const v_int8x64& a)
{
    int64 mask = _mm512_movepi8_mask(a.val);
    int mask32 = (int)mask;
    return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
}
inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
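// v512_cleanup zeroes the vector registers via vzeroall so that subsequently executed SSE code
// does not pay AVX-SSE transition penalties.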
inline void v512_cleanup() { _mm256_zeroall(); }

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

#endif // OPENCV_HAL_INTRIN_AVX512_HPP