EstervQrCode 2.0.0
Library for QR code manipulation
intrin_avx512.hpp
1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html
4
5#ifndef OPENCV_HAL_INTRIN_AVX512_HPP
6#define OPENCV_HAL_INTRIN_AVX512_HPP
7
8#if defined(_MSC_VER) && (_MSC_VER < 1920/*MSVS2019*/)
9# pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
10# pragma warning(disable:4309) // 'argument': truncation of constant value
11# pragma warning(disable:4310) // cast truncates constant value
12#endif
13
14#define CVT_ROUND_MODES_IMPLEMENTED 0
15
16#define CV_SIMD512 1
17#define CV_SIMD512_64F 1
18#define CV_SIMD512_FP16 0 // no native operations with FP16 type. Only load/store from float32x16 are available (if CV_FP16 == 1)
19
20#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0))
21#define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
22 _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
23 ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0))
24#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
25 a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
26 _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
27 ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
28 ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
29 ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0))
30#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
31 a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
32 a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
33 a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
34 _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
35 ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
36 ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
37 ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
38 ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
39 ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
40 ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
41 ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0))
42#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
43 a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
44 a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
45 a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
46 _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
47 (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
48 (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
49 (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
50 (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
51 (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
52 (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
53 (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0))
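
// The _v512_set_* helpers above emulate lane-wise "set" for 8/16/32-bit lanes by packing
// them into the 64-bit arguments of _mm512_set_epi64, which keeps the code buildable on
// toolchains that lack _mm512_set_epi8/_mm512_set_epi16. Illustrative sketch (assumes the
// surrounding AVX-512 build configuration):
//   __m512i ramp = _v512_set_epu32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
//   // lane 0 holds 0 and lane 15 holds 15, matching _mm512_setr_epi32(0, 1, ..., 15).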
54
55#ifndef _mm512_cvtpd_pslo
56#ifdef _mm512_zextsi256_si512
57#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
58#else
59// fall back to a plain cast when the preferred zero-extending cast is unavailable
60#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
61#endif
62#endif
64
65namespace
66{
67
68inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi)
69{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); }
70
71inline __m512 _v512_combine(const __m256& lo, const __m256& hi)
72{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); }
73
74inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi)
75{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); }
76
77inline int _v_cvtsi512_si32(const __m512i& a)
78{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); }
79
80inline __m256i _v512_extract_high(const __m512i& v)
81{ return _mm512_extracti32x8_epi32(v, 1); }
82
83inline __m256 _v512_extract_high(const __m512& v)
84{ return _mm512_extractf32x8_ps(v, 1); }
85
86inline __m256d _v512_extract_high(const __m512d& v)
87{ return _mm512_extractf64x4_pd(v, 1); }
88
89inline __m256i _v512_extract_low(const __m512i& v)
90{ return _mm512_castsi512_si256(v); }
91
92inline __m256 _v512_extract_low(const __m512& v)
93{ return _mm512_castps512_ps256(v); }
94
95inline __m256d _v512_extract_low(const __m512d& v)
96{ return _mm512_castpd512_pd256(v); }
97
98inline __m512i _v512_insert(const __m512i& a, const __m256i& b)
99{ return _mm512_inserti32x8(a, b, 0); }
100
101inline __m512 _v512_insert(const __m512& a, const __m256& b)
102{ return _mm512_insertf32x8(a, b, 0); }
103
104inline __m512d _v512_insert(const __m512d& a, const __m256d& b)
105{ return _mm512_insertf64x4(a, b, 0); }
106
107}
108
109namespace cv
110{
111
113
114CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
115
117
118struct v_uint8x64
119{
120 typedef uchar lane_type;
121 enum { nlanes = 64 };
122 __m512i val;
123
124 explicit v_uint8x64(__m512i v) : val(v) {}
125 v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3,
126 uchar v4, uchar v5, uchar v6, uchar v7,
127 uchar v8, uchar v9, uchar v10, uchar v11,
128 uchar v12, uchar v13, uchar v14, uchar v15,
129 uchar v16, uchar v17, uchar v18, uchar v19,
130 uchar v20, uchar v21, uchar v22, uchar v23,
131 uchar v24, uchar v25, uchar v26, uchar v27,
132 uchar v28, uchar v29, uchar v30, uchar v31,
133 uchar v32, uchar v33, uchar v34, uchar v35,
134 uchar v36, uchar v37, uchar v38, uchar v39,
135 uchar v40, uchar v41, uchar v42, uchar v43,
136 uchar v44, uchar v45, uchar v46, uchar v47,
137 uchar v48, uchar v49, uchar v50, uchar v51,
138 uchar v52, uchar v53, uchar v54, uchar v55,
139 uchar v56, uchar v57, uchar v58, uchar v59,
140 uchar v60, uchar v61, uchar v62, uchar v63)
141 {
142 val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
143 v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
144 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
145 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
146 }
147 v_uint8x64() {}
148
149 static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }
150
151 uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
152};
153
154struct v_int8x64
155{
156 typedef schar lane_type;
157 enum { nlanes = 64 };
158 __m512i val;
159
160 explicit v_int8x64(__m512i v) : val(v) {}
161 v_int8x64(schar v0, schar v1, schar v2, schar v3,
162 schar v4, schar v5, schar v6, schar v7,
163 schar v8, schar v9, schar v10, schar v11,
164 schar v12, schar v13, schar v14, schar v15,
165 schar v16, schar v17, schar v18, schar v19,
166 schar v20, schar v21, schar v22, schar v23,
167 schar v24, schar v25, schar v26, schar v27,
168 schar v28, schar v29, schar v30, schar v31,
169 schar v32, schar v33, schar v34, schar v35,
170 schar v36, schar v37, schar v38, schar v39,
171 schar v40, schar v41, schar v42, schar v43,
172 schar v44, schar v45, schar v46, schar v47,
173 schar v48, schar v49, schar v50, schar v51,
174 schar v52, schar v53, schar v54, schar v55,
175 schar v56, schar v57, schar v58, schar v59,
176 schar v60, schar v61, schar v62, schar v63)
177 {
178 val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
179 v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
180 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
181 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
182 }
183 v_int8x64() {}
184
185 static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }
186
187 schar get0() const { return (schar)_v_cvtsi512_si32(val); }
188};
189
190struct v_uint16x32
191{
192 typedef ushort lane_type;
193 enum { nlanes = 32 };
194 __m512i val;
195
196 explicit v_uint16x32(__m512i v) : val(v) {}
197 v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3,
198 ushort v4, ushort v5, ushort v6, ushort v7,
199 ushort v8, ushort v9, ushort v10, ushort v11,
200 ushort v12, ushort v13, ushort v14, ushort v15,
201 ushort v16, ushort v17, ushort v18, ushort v19,
202 ushort v20, ushort v21, ushort v22, ushort v23,
203 ushort v24, ushort v25, ushort v26, ushort v27,
204 ushort v28, ushort v29, ushort v30, ushort v31)
205 {
206 val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
207 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
208 }
209 v_uint16x32() {}
210
211 static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }
212
213 ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
214};
215
216struct v_int16x32
217{
218 typedef short lane_type;
219 enum { nlanes = 32 };
220 __m512i val;
221
222 explicit v_int16x32(__m512i v) : val(v) {}
223 v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
224 short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15,
225 short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
226 short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)
227 {
228 val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24,
229 (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16,
230 (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
231 (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0);
232 }
233 v_int16x32() {}
234
235 static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }
236
237 short get0() const { return (short)_v_cvtsi512_si32(val); }
238};
239
240struct v_uint32x16
241{
242 typedef unsigned lane_type;
243 enum { nlanes = 16 };
244 __m512i val;
245
246 explicit v_uint32x16(__m512i v) : val(v) {}
247 v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
248 unsigned v4, unsigned v5, unsigned v6, unsigned v7,
249 unsigned v8, unsigned v9, unsigned v10, unsigned v11,
250 unsigned v12, unsigned v13, unsigned v14, unsigned v15)
251 {
252 val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7,
253 (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15);
254 }
255 v_uint32x16() {}
256
257 static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }
258
259 unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
260};
261
262struct v_int32x16
263{
264 typedef int lane_type;
265 enum { nlanes = 16 };
266 __m512i val;
267
268 explicit v_int32x16(__m512i v) : val(v) {}
269 v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
270 int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
271 {
272 val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
273 }
274 v_int32x16() {}
275
276 static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }
277
278 int get0() const { return _v_cvtsi512_si32(val); }
279};
280
281struct v_float32x16
282{
283 typedef float lane_type;
284 enum { nlanes = 16 };
285 __m512 val;
286
287 explicit v_float32x16(__m512 v) : val(v) {}
288 v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7,
289 float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
290 {
291 val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
292 }
293 v_float32x16() {}
294
295 static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }
296
297 float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
298};
299
300struct v_uint64x8
301{
302 typedef uint64 lane_type;
303 enum { nlanes = 8 };
304 __m512i val;
305
306 explicit v_uint64x8(__m512i v) : val(v) {}
307 v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)
308 { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); }
309 v_uint64x8() {}
310
311 static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }
312
313 uint64 get0() const
314 {
315 #if defined __x86_64__ || defined _M_X64
316 return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
317 #else
318 int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
319 int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
320 return (unsigned)a | ((uint64)(unsigned)b << 32);
321 #endif
322 }
323};
324
325struct v_int64x8
326{
327 typedef int64 lane_type;
328 enum { nlanes = 8 };
329 __m512i val;
330
331 explicit v_int64x8(__m512i v) : val(v) {}
332 v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
333 { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); }
334 v_int64x8() {}
335
336 static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }
337
338 int64 get0() const
339 {
340 #if defined __x86_64__ || defined _M_X64
341 return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
342 #else
343 int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
344 int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
345 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
346 #endif
347 }
348};
349
350struct v_float64x8
351{
352 typedef double lane_type;
353 enum { nlanes = 8 };
354 __m512d val;
355
356 explicit v_float64x8(__m512d v) : val(v) {}
357 v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
358 { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); }
359 v_float64x8() {}
360
361 static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }
362
363 double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
364};
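
// Usage sketch for the wrapper types above (illustrative only; lane values are arbitrary):
//   v_int32x16 v(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
//   int first = v.get0();                  // 0, the lowest lane
//   v_float64x8 z = v_float64x8::zero();   // all lanes 0.0
// Each struct is a thin value wrapper around a single ZMM register plus the lane type and
// lane count that the generic universal-intrinsics code relies on.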
365
367
368#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \
369 inline _Tpvec v512_load(const _Tp* ptr) \
370 { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \
371 inline _Tpvec v512_load_aligned(const _Tp* ptr) \
372 { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \
373 inline _Tpvec v512_load_low(const _Tp* ptr) \
374 { \
375 __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \
376 return _Tpvec(_mm512_castsi256_si512(v256)); \
377 } \
378 inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
379 { \
380 __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \
381 __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \
382 return _Tpvec(_v512_combine(vlo, vhi)); \
383 } \
384 inline void v_store(_Tp* ptr, const _Tpvec& a) \
385 { _mm512_storeu_si512((__m512i*)ptr, a.val); } \
386 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
387 { _mm512_store_si512((__m512i*)ptr, a.val); } \
388 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
389 { _mm512_stream_si512((__m512i*)ptr, a.val); } \
390 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
391 { \
392 if( mode == hal::STORE_UNALIGNED ) \
393 _mm512_storeu_si512((__m512i*)ptr, a.val); \
394 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
395 _mm512_stream_si512((__m512i*)ptr, a.val); \
396 else \
397 _mm512_store_si512((__m512i*)ptr, a.val); \
398 } \
399 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
400 { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \
401 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
402 { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }
403
404OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64, uchar)
405OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64, schar)
406OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
407OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32, short)
408OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned)
409OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16, int)
410OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8, uint64)
411OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8, int64)
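
// The macro above stamps out the unaligned/aligned/streaming load-store family for each
// integer vector type. A hedged usage sketch (the buffer name is illustrative):
//   CV_DECL_ALIGNED(64) uchar buf[64];
//   v_uint8x64 x = v512_load(buf);            // unaligned load is always safe
//   v_store_aligned(buf, x);                  // requires 64-byte alignment
//   v_store(buf, x, hal::STORE_UNALIGNED);    // store mode selected at run time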
412
413#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
414 inline _Tpvec v512_load(const _Tp* ptr) \
415 { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \
416 inline _Tpvec v512_load_aligned(const _Tp* ptr) \
417 { return _Tpvec(_mm512_load_##suffix(ptr)); } \
418 inline _Tpvec v512_load_low(const _Tp* ptr) \
419 { \
420 return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \
421 (_mm256_loadu_##suffix(ptr))); \
422 } \
423 inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
424 { \
425 halfreg vlo = _mm256_loadu_##suffix(ptr0); \
426 halfreg vhi = _mm256_loadu_##suffix(ptr1); \
427 return _Tpvec(_v512_combine(vlo, vhi)); \
428 } \
429 inline void v_store(_Tp* ptr, const _Tpvec& a) \
430 { _mm512_storeu_##suffix(ptr, a.val); } \
431 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
432 { _mm512_store_##suffix(ptr, a.val); } \
433 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
434 { _mm512_stream_##suffix(ptr, a.val); } \
435 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
436 { \
437 if( mode == hal::STORE_UNALIGNED ) \
438 _mm512_storeu_##suffix(ptr, a.val); \
439 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
440 _mm512_stream_##suffix(ptr, a.val); \
441 else \
442 _mm512_store_##suffix(ptr, a.val); \
443 } \
444 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
445 { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \
446 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
447 { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }
448
449OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float, ps, __m256)
450OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)
451
452#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \
453 inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
454 { return _Tpvec(cast(a.val)); }
455
456#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
457 inline _Tpvec v512_setzero_##suffix() \
458 { return _Tpvec(_mm512_setzero_si512()); } \
459 inline _Tpvec v512_setall_##suffix(_Tp v) \
460 { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
461 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \
462 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \
463 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \
464 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, OPENCV_HAL_NOP) \
465 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, OPENCV_HAL_NOP) \
466 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, OPENCV_HAL_NOP) \
467 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, OPENCV_HAL_NOP) \
468 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, OPENCV_HAL_NOP) \
469 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
470 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8, suffix, _mm512_castpd_si512)
471
472OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64, uchar, u8, epi8, char)
473OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64, schar, s8, epi8, char)
474OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort, u16, epi16, short)
475OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32, short, s16, epi16, short)
476OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int)
477OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16, int, s32, epi32, int)
478OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8, uint64, u64, epi64, int64)
479OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64)
480
481#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
482 inline _Tpvec v512_setzero_##suffix() \
483 { return _Tpvec(_mm512_setzero_##zsuffix()); } \
484 inline _Tpvec v512_setall_##suffix(_Tp v) \
485 { return _Tpvec(_mm512_set1_##zsuffix(v)); } \
486 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \
487 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \
488 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
489 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, cast) \
490 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \
491 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, cast) \
492 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, cast) \
493 OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, cast)
494
495OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float, f32, ps, _mm512_castsi512_ps)
496OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8, double, f64, pd, _mm512_castsi512_pd)
497
498inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a)
499{ return a; }
500inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a)
501{ return v_float32x16(_mm512_castpd_ps(a.val)); }
502
503inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a)
504{ return a; }
505inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
506{ return v_float64x8(_mm512_castps_pd(a.val)); }
507
508// FP16
509inline v_float32x16 v512_load_expand(const hfloat* ptr)
510{
511 return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
512}
513
514inline void v_pack_store(hfloat* ptr, const v_float32x16& a)
515{
516 __m256i ah = _mm512_cvtps_ph(a.val, 0);
517 _mm256_storeu_si256((__m256i*)ptr, ah);
518}
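
// Hedged sketch of the FP16 bridge above: 16 half-precision values expand into one
// v_float32x16, and v_pack_store converts back with rounding mode 0 (round-to-nearest-even).
//   hfloat halves[16] = {};  float floats[16];
//   v_float32x16 f = v512_load_expand(halves);
//   v_store(floats, f);
//   v_pack_store(halves, f);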
519
520/* Recombine & ZIP */
521inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
522{
523#if CV_AVX_512VBMI
524 __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24,
525 87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16,
526 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8,
527 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0);
528 ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val));
529 __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
530 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
531 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
532 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32);
533 ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val));
534#else
535 __m512i low = _mm512_unpacklo_epi8(a.val, b.val);
536 __m512i high = _mm512_unpackhi_epi8(a.val, b.val);
537 ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0), high));
538 ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high));
539#endif
540}
541inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
542{
543 __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8,
544 39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0);
545 ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val));
546 __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
547 55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
548 ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val));
549}
550inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
551{
552 __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
553 ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val));
554 __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
555 ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val));
556}
557inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
558{
559 __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0);
560 ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val));
561 __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
562 ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val));
563}
564
565inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1)
566{
567 v_int8x64 i0, i1;
568 v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1);
569 ab0 = v_reinterpret_as_u8(i0);
570 ab1 = v_reinterpret_as_u8(i1);
571}
572inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1)
573{
574 v_int16x32 i0, i1;
575 v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1);
576 ab0 = v_reinterpret_as_u16(i0);
577 ab1 = v_reinterpret_as_u16(i1);
578}
579inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1)
580{
581 v_int32x16 i0, i1;
582 v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
583 ab0 = v_reinterpret_as_u32(i0);
584 ab1 = v_reinterpret_as_u32(i1);
585}
586inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1)
587{
588 v_int64x8 i0, i1;
589 v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
590 ab0 = v_reinterpret_as_u64(i0);
591 ab1 = v_reinterpret_as_u64(i1);
592}
593inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1)
594{
595 v_int32x16 i0, i1;
596 v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
597 ab0 = v_reinterpret_as_f32(i0);
598 ab1 = v_reinterpret_as_f32(i1);
599}
600inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1)
601{
602 v_int64x8 i0, i1;
603 v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
604 ab0 = v_reinterpret_as_f64(i0);
605 ab1 = v_reinterpret_as_f64(i1);
606}
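
// v_zip interleaves two registers lane by lane across the full 512-bit width. For 32-bit
// lanes (illustrative):
//   a   = {a0, a1, ..., a15},  b = {b0, b1, ..., b15}
//   ab0 = {a0, b0, a1, b1, ..., a7, b7}
//   ab1 = {a8, b8, a9, b9, ..., a15, b15}
// The permutex2var index vectors above encode exactly this cross-lane interleave; a plain
// unpacklo/unpackhi pair cannot do it alone because those instructions work per 128-bit block.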
607
608#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \
609 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
610 { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
611 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
612 { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \
613 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
614 _Tpvec& c, _Tpvec& d) \
615 { \
616 c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val)); \
617 d.val = _v512_insert(b.val,_v512_extract_high(a.val)); \
618 }
619
620
621OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64, epi8)
622OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64, epi8)
623OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32, epi16)
624OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32, epi16)
625OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16, epi32)
626OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16, epi32)
627OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8, epi64)
628OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8, epi64)
629OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
630OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8, pd)
631
632
633
634/* Element-wise binary and unary operations */
635
636
637#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \
638 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
639 { return _Tpvec(intrin(a.val, b.val)); }
640
641OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8)
642OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8)
643OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
644OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16)
645OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8)
646OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8)
647OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
648OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16)
649OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16)
650OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16)
651
652inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
653{
654 __m512i ad = _mm512_srai_epi16(a.val, 8);
655 __m512i bd = _mm512_srai_epi16(b.val, 8);
656 __m512i p0 = _mm512_mullo_epi16(a.val, b.val); // even
657 __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8); // odd
658 return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1));
659}
660inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
661{
662 return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
663}
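
// Note on the byte-wise v_mul_wrap above: AVX-512 has no 8-bit multiply, so it multiplies
// 16-bit lanes twice. p0 already holds the correct low byte of every even-positioned
// product; ad/bd shift the odd bytes down, multiply them, and the result is shifted back
// into the odd byte positions. The 0xAAAA...AAAA blend mask (every odd bit set) then picks
// the odd bytes from p1 and the even bytes from p0.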
664
665#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
666 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
667 { return _Tpvec(intrin(a.val, b.val)); } \
668 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
669 { a.val = intrin(a.val, b.val); return a; }
670
671OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
672OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
673OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
674OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
675OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
676OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
677OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
678OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
679
680OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
681OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
682OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
683OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
684
685
686OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8)
687OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8)
688OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8)
689OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8)
690OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
691OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
692OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16)
693OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16)
694
695OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
696OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
697OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
698OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
699OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
700OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
701OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
702OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
703
704// saturating multiply
705inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
706{
707 v_uint16x32 c, d;
708 v_mul_expand(a, b, c, d);
709 return v_pack(c, d);
710}
711inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
712{
713 v_int16x32 c, d;
714 v_mul_expand(a, b, c, d);
715 return v_pack(c, d);
716}
717inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
718{
719 __m512i pl = _mm512_mullo_epi16(a.val, b.val);
720 __m512i ph = _mm512_mulhi_epu16(a.val, b.val);
721 __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
722 __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
723
724 const __m512i m = _mm512_set1_epi32(65535);
725 return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
726}
727inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
728{
729 __m512i pl = _mm512_mullo_epi16(a.val, b.val);
730 __m512i ph = _mm512_mulhi_epi16(a.val, b.val);
731 __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
732 __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
733 return v_int16x32(_mm512_packs_epi32(p0, p1));
734}
735
736inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
737{ a = a * b; return a; }
738inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
739{ a = a * b; return a; }
740inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
741{ a = a * b; return a; }
742inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
743{ a = a * b; return a; }
744
745inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
746inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
747
748// Multiply and expand
749inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
750 v_uint16x32& c, v_uint16x32& d)
751{
752 v_uint16x32 a0, a1, b0, b1;
753 v_expand(a, a0, a1);
754 v_expand(b, b0, b1);
755 c = v_mul_wrap(a0, b0);
756 d = v_mul_wrap(a1, b1);
757}
758
759inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
760 v_int16x32& c, v_int16x32& d)
761{
762 v_int16x32 a0, a1, b0, b1;
763 v_expand(a, a0, a1);
764 v_expand(b, b0, b1);
765 c = v_mul_wrap(a0, b0);
766 d = v_mul_wrap(a1, b1);
767}
768
769inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
770 v_int32x16& c, v_int32x16& d)
771{
772 v_int16x32 v0, v1;
773 v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
774
775 c = v_reinterpret_as_s32(v0);
776 d = v_reinterpret_as_s32(v1);
777}
778
779inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
780 v_uint32x16& c, v_uint32x16& d)
781{
782 v_uint16x32 v0, v1;
783 v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
784
785 c = v_reinterpret_as_u32(v0);
786 d = v_reinterpret_as_u32(v1);
787}
788
789inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b,
790 v_uint64x8& c, v_uint64x8& d)
791{
792 v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)),
793 v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
794}
795
796inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
797 v_int64x8& c, v_int64x8& d)
798{
799 v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)),
800 v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
801}
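
// v_mul_expand produces full-width products: each pair of input lanes is multiplied without
// truncation and the results are split across two vectors of the wider type. Hedged sketch
// for 32-bit inputs (values chosen so the product overflows 32 bits):
//   v_uint32x16 a = v512_setall_u32(0xFFFFFFFFu), b = v512_setall_u32(2u);
//   v_uint64x8 lo, hi;
//   v_mul_expand(a, b, lo, hi);   // every 64-bit lane now holds 0x1FFFFFFFE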
802
804#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
805 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
806 { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
807 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
808 { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
809 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
810 { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
811 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
812 { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
813 template<int imm> \
814 inline _Tpuvec v_shl(const _Tpuvec& a) \
815 { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
816 template<int imm> \
817 inline _Tpsvec v_shl(const _Tpsvec& a) \
818 { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
819 template<int imm> \
820 inline _Tpuvec v_shr(const _Tpuvec& a) \
821 { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
822 template<int imm> \
823 inline _Tpsvec v_shr(const _Tpsvec& a) \
824 { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }
825
826OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
827OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
828OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64)
829
830
831
832#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
833 OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
834 OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
835 OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
836 inline _Tpvec operator ~ (const _Tpvec& a) \
837 { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
838
839OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1))
840OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64, si512, _mm512_set1_epi32(-1))
841OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32, si512, _mm512_set1_epi32(-1))
842OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32, si512, _mm512_set1_epi32(-1))
843OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16, si512, _mm512_set1_epi32(-1))
844OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16, si512, _mm512_set1_epi32(-1))
845OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8, si512, _mm512_set1_epi64(-1))
846OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8, si512, _mm512_set1_epi64(-1))
847OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps, _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
848OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8, pd, _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
849
851#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \
852 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
853 { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }
854
855OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64, epi8, si512)
856OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64, epi8, si512)
857OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512)
858OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32, epi16, si512)
859OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512)
860OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16, epi32, si512)
861OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8, epi64, si512)
862OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8, epi64, si512)
863OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps, ps)
864OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd)
865
866
867#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
868 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
869 { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
870
871#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
872 OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
873 OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
874 OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
875 OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
876 OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
877 OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
878
879OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1)
880OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1)
881OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
882OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32, epi16, epi16, (short)-1)
883OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
884OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16, epi32, epi32, (int)-1)
885OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1)
886OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1)
887
888#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
889 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
890 { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
891
892#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
893 OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
894 OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
895 OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
896 OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
897 OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
898 OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
899
900OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
901OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1)
902
903inline v_float32x16 v_not_nan(const v_float32x16& a)
904{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); }
905inline v_float64x8 v_not_nan(const v_float64x8& a)
906{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); }
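
// Comparison operators return all-ones lanes for "true" and all-zero lanes for "false":
// the AVX-512 predicate produced by _mm512_cmp_*_mask is expanded back into a full vector
// with _mm512_maskz_set1_*, so the result composes with v_select and the bitwise operators.
// Hedged sketch:
//   v_float32x16 m = (a == b);          // 0xFFFFFFFF where equal, 0 elsewhere
//   v_float32x16 r = v_select(m, a, b);
// v_not_nan(a) compares a with itself using the ordered predicate, i.e. it is all-ones
// exactly where the lane is not NaN.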
907
909OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64, _mm512_min_epu8)
910OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64, _mm512_max_epu8)
911OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64, _mm512_min_epi8)
912OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64, _mm512_max_epi8)
913OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32, _mm512_min_epu16)
914OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32, _mm512_max_epu16)
915OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32, _mm512_min_epi16)
916OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32, _mm512_max_epi16)
917OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16, _mm512_min_epu32)
918OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16, _mm512_max_epu32)
919OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16, _mm512_min_epi32)
920OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16, _mm512_max_epi32)
921OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8, _mm512_min_epu64)
922OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8, _mm512_max_epu64)
923OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8, _mm512_min_epi64)
924OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8, _mm512_max_epi64)
925OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
926OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
927OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8, _mm512_min_pd)
928OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8, _mm512_max_pd)
929
930
931namespace {
932 template<bool prec, int imm4, bool part, int imm32>
933 struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
934 template<int imm4, int imm32>
935 struct _v_rotate_right<true, imm4, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
936 {
937 return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 ), imm4 *8),
938 _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8)));
939 }};
940 template<int imm4>
941 struct _v_rotate_right<true, imm4, false, 15> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
942 {
943 return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15), imm4 *8),
944 _mm512_slli_epi32( b.val, (4-imm4)*8)));
945 }};
946 template<int imm4, int imm32>
947 struct _v_rotate_right<true, imm4, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
948 {
949 return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16), imm4 *8),
950 _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
951 }};
952 template<int imm4>
953 struct _v_rotate_right<true, imm4, true, 31> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
954 { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }};
955 template<int imm32>
956 struct _v_rotate_right<false, 0, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
957 { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }};
958 template<>
959 struct _v_rotate_right<false, 0, false, 0> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }};
960 template<int imm32>
961 struct _v_rotate_right<false, 0, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
962 { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }};
963 template<>
964 struct _v_rotate_right<false, 0, true, 16> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }};
965 template<>
966 struct _v_rotate_right<false, 0, true, 32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
967}
968template<int imm> inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
969{
970 return imm >= 128 ? v_int8x64() :
971#if CV_AVX_512VBMI
972 v_int8x64(_mm512_permutex2var_epi8(a.val,
973 _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
974 0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
975 0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
976 0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
977 0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
978 0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
979 0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
980 0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
981#else
982 _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b);
983#endif
984}
985template<int imm>
986inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
987{
988 if (imm == 0) return a;
989 if (imm == 64) return b;
990 if (imm >= 128) return v_int8x64();
991#if CV_AVX_512VBMI
992 return v_int8x64(_mm512_permutex2var_epi8(b.val,
993 _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
994 0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
995 0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
996 0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
997 0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
998 0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
999 0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
1000 0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
1001#else
1002 return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b);
1003#endif
1004}
1005template<int imm>
1006inline v_int8x64 v_rotate_right(const v_int8x64& a)
1007{
1008 if (imm == 0) return a;
1009 if (imm >= 64) return v_int8x64();
1010#if CV_AVX_512VBMI
1011 return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm,
1012 _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
1013 0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
1014 0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
1015 0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
1016 0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
1017 0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
1018 0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
1019 0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
1020#else
1021 return v_rotate_right<imm>(a, v512_setzero_s8());
1022#endif
1023}
1024template<int imm>
1025inline v_int8x64 v_rotate_left(const v_int8x64& a)
1026{
1027 if (imm == 0) return a;
1028 if (imm >= 64) return v_int8x64();
1029#if CV_AVX_512VBMI
1030 return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
1031 _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
1032 0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
1033 0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
1034 0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
1035 0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
1036 0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
1037 0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
1038 0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
1039#else
1040 return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
1041#endif
1042}
1043
1044#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \
1045template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1046{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1047template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1048{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1049template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1050{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); } \
1051template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1052{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }
1053
1054#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \
1055template<int imm> \
1056inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1057{ \
1058 enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
1059 enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
1060 if (imm == 0) return a; \
1061 if (imm == _Tpvec::nlanes) return b; \
1062 if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
1063 return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
1064} \
1065template<int imm> \
1066inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1067{ \
1068 enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
1069 enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
1070 if (imm == 0) return a; \
1071 if (imm == _Tpvec::nlanes) return b; \
1072 if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
1073 return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
1074} \
1075template<int imm> \
1076inline _Tpvec v_rotate_left(const _Tpvec& a) \
1077{ \
1078 if (imm == 0) return a; \
1079 if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
1080 return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
1081} \
1082template<int imm> \
1083inline _Tpvec v_rotate_right(const _Tpvec& a) \
1084{ \
1085 if (imm == 0) return a; \
1086 if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
1087 return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
1088}
1089
1090OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64, u8)
1091OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32, u16)
1092OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32, s16)
1093OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16, epi32)
1094OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16, epi32)
1095OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8, epi64)
1096OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64)
1097OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
1098OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd)
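
// v_rotate_right<n>(a, b) shifts lanes toward index 0 by n and pulls the vacated top lanes
// from b; the single-argument forms fill with zeros instead. Illustrative sketch for 32-bit
// lanes:
//   a = {a0, ..., a15},  b = {b0, ..., b15}
//   v_rotate_right<2>(a, b) -> {a2, ..., a15, b0, b1}
//   v_rotate_left<2>(a)     -> {0, 0, a0, ..., a13}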
1099
1100
1101inline v_uint8x64 v_reverse(const v_uint8x64 &a)
1102{
1103#if CV_AVX_512VBMI
1104 static const __m512i perm = _mm512_set_epi32(
1105 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1106 0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
1107 0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
1108 0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
1109 return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
1110#else
1111 static const __m512i shuf = _mm512_set_epi32(
1112 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1113 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1114 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1115 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
1116 static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
1117 __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
1118 return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
1119#endif
1120}
1121
1122inline v_int8x64 v_reverse(const v_int8x64 &a)
1123{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1124
1125inline v_uint16x32 v_reverse(const v_uint16x32 &a)
1126{
1127#if CV_AVX_512VBMI
1128 static const __m512i perm = _mm512_set_epi32(
1129 0x00000001, 0x00020003, 0x00040005, 0x00060007,
1130 0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
1131 0x00100011, 0x00120013, 0x00140015, 0x00160017,
1132 0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
1133 return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
1134#else
1135 static const __m512i shuf = _mm512_set_epi32(
1136 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1137 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1138 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1139 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
1140 static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
1141 __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
1142 return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
1143#endif
1144}
1145
1146inline v_int16x32 v_reverse(const v_int16x32 &a)
1147{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1148
1149inline v_uint32x16 v_reverse(const v_uint32x16 &a)
1150{
1151    static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1152 return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
1153}
1154
1155inline v_int32x16 v_reverse(const v_int32x16 &a)
1156{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1157
1158inline v_float32x16 v_reverse(const v_float32x16 &a)
1159{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1160
1161inline v_uint64x8 v_reverse(const v_uint64x8 &a)
1162{
1163 static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
1164 return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
1165}
1166
1167inline v_int64x8 v_reverse(const v_int64x8 &a)
1168{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1169
1170inline v_float64x8 v_reverse(const v_float64x8 &a)
1171{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1172
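// Usage sketch (illustrative comment, not part of the API): v_reverse flips the
// lane order, so lane i of the result holds lane (nlanes - 1 - i) of the input.
//   uchar buf[64];                       // assumed to be filled by the caller
//   v_uint8x64 v = v512_load(buf);
//   v_uint8x64 r = v_reverse(v);         // r = buf[63], buf[62], ..., buf[0]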
1174
1176#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b
1177#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \
1178 inline sctype v_reduce_##func(const _Tpvec& a) \
1179 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1180 sctype CV_DECL_ALIGNED(64) idx[2]; \
1181 _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \
1182 return scop(idx[0], idx[1]); }
1183OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
1184OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
1185OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1186OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, min, v_int64x8, min_epi64, min)
1187OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, max, v_int64x8, max_epi64, max)
1188OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, sum, v_int64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1189
1190#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \
1191 inline double v_reduce_##func(const v_float64x8& a) \
1192 { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1193 double CV_DECL_ALIGNED(64) idx[2]; \
1194 _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \
1195 return scop(idx[0], idx[1]); }
1196OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
1197OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
1198OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1199
1200#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \
1201 inline sctype v_reduce_##func(const _Tpvec& a) \
1202 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1203 __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1204 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1205 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1206 return (sctype)_mm_cvtsi128_si32(quarter); }
1207OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
1208OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
1209OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, min, v_int32x16, min_epi32)
1210OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, max, v_int32x16, max_epi32)
1211
1212#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \
1213 inline float v_reduce_##func(const v_float32x16& a) \
1214 { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1215 __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \
1216 quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \
1217 quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \
1218 return _mm_cvtss_f32(quarter); }
1219OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
1220OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)
1221
1222inline float v_reduce_sum(const v_float32x16& a)
1223{
1224 __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
1225 __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1));
1226 quarter = _mm_hadd_ps(quarter, quarter);
1227 return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter));
1228}
1229inline int v_reduce_sum(const v_int32x16& a)
1230{
1231 __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
1232 __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1233 quarter = _mm_hadd_epi32(quarter, quarter);
1234 return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
1235}
1236inline uint v_reduce_sum(const v_uint32x16& a)
1237{ return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); }
1238
1239#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \
1240 inline sctype v_reduce_##func(const _Tpvec& a) \
1241 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1242 __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1243 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1244 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1245 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1246 return (sctype)_mm_cvtsi128_si32(quarter); }
1247OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
1248OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
1249OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16)
1250OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16)
1251
1252inline int v_reduce_sum(const v_int16x32& a)
1253{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1254inline uint v_reduce_sum(const v_uint16x32& a)
1255{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1256
1257#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
1258 inline sctype v_reduce_##func(const _Tpvec& a) \
1259 { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1260 __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1261 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1262 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1263 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1264 quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \
1265 return (sctype)_mm_cvtsi128_si32(quarter); }
1266OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
1267OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
1268OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8)
1269OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8)
1270
1271#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \
1272 inline sctype v_reduce_sum(const _Tpvec& a) \
1273 { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \
1274 _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \
1275 a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \
1276 __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \
1277 __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \
1278 a4 = _mm_hadd_epi32(a4, a4); \
1279 return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); }
1280OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
1281OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8)
1282
1283inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,
1284 const v_float32x16& c, const v_float32x16& d)
1285{
1286 __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
1287 __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
1288 __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
1289 __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
1290 return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh)));
1291}
1292
1293inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)
1294{
1295 __m512i val = _mm512_sad_epu8(a.val, b.val);
1296 __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
1297 __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1298 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1299}
1300inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
1301{
1302 __m512i val = _mm512_set1_epi8(-128);
1303 val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val));
1304 __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
1305 __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1306 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1307}
1308inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
1309{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
1310inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
1311{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
1312inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
1313{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
1314inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
1315{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
1316inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
1317{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
1318inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
1319{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
1320
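// Usage sketch (illustrative only): the reduce family collapses a whole vector
// into one scalar.
//   float data[16];                      // assumed to be filled by the caller
//   v_float32x16 v = v512_load(data);
//   float s  = v_reduce_sum(v);          // data[0] + ... + data[15]
//   float mx = v_reduce_max(v);          // largest of the 16 lanes
// v_reduce_sad(a, b) returns the scalar sum of |a[i] - b[i]| over all lanes.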
1322inline v_uint8x64 v_popcount(const v_int8x64& a)
1323{
1324#if CV_AVX_512BITALG
1325 return v_uint8x64(_mm512_popcnt_epi8(a.val));
1326#elif CV_AVX_512VBMI
1327 __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
1328 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
1329 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
1330 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
1331 __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
1332 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
1333 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
1334 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
1335 return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));
1336#else
1337 __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);
1338 __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);
1339
1340 return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512( a.val, _popcnt_mask)),
1341 _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));
1342#endif
1343}
1344inline v_uint16x32 v_popcount(const v_int16x32& a)
1345{
1346#if CV_AVX_512BITALG
1347 return v_uint16x32(_mm512_popcnt_epi16(a.val));
1348#elif CV_AVX_512VPOPCNTDQ
1349 __m512i zero = _mm512_setzero_si512();
1350 return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)),
1351 _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
1352#else
1353 v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
1354 p += v_rotate_right<1>(p);
1355 return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
1356#endif
1357}
1358inline v_uint32x16 v_popcount(const v_int32x16& a)
1359{
1360#if CV_AVX_512VPOPCNTDQ
1361 return v_uint32x16(_mm512_popcnt_epi32(a.val));
1362#else
1363 v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
1364 p += v_rotate_right<1>(p);
1365 p += v_rotate_right<2>(p);
1366 return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
1367#endif
1368}
1369inline v_uint64x8 v_popcount(const v_int64x8& a)
1370{
1371#if CV_AVX_512VPOPCNTDQ
1372 return v_uint64x8(_mm512_popcnt_epi64(a.val));
1373#else
1374 return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512()));
1375#endif
1376}
1377
1378
1379inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8 (a)); }
1380inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); }
1381inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); }
1382inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); }
1383
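// Usage sketch (illustrative): v_popcount counts the set bits of each lane
// independently and returns the counts in the matching unsigned type.
//   v_uint32x16 v = v512_setall_u32(0x0F0F0F0Fu);
//   v_uint32x16 c = v_popcount(v);       // every lane equals 16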
1384
1386
1388#if CV_FMA3
1389#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
1390 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1391 { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
1392 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1393 { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); }
1394#else
1395#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
1396 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1397 { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); } \
1398 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1399 { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); }
1400#endif
1401
1402#define OPENCV_HAL_IMPL_AVX512_MISC(_Tpvec, suffix) \
1403 inline _Tpvec v_sqrt(const _Tpvec& x) \
1404 { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
1405 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1406 { return v_fma(a, a, b * b); } \
1407 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1408 { return v_sqrt(v_fma(a, a, b * b)); }
1409
1410OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
1411OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
1412OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)
1413OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd)
1414
1415inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
1416{ return a * b + c; }
1417inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
1418{ return v_fma(a, b, c); }
1419
1420inline v_float32x16 v_invsqrt(const v_float32x16& x)
1421{
1422#if CV_AVX_512ER
1423 return v_float32x16(_mm512_rsqrt28_ps(x.val));
1424#else
1425 v_float32x16 half = x * v512_setall_f32(0.5);
1426 v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
1427 t *= v512_setall_f32(1.5) - ((t * t) * half);
1428 return t;
1429#endif
1430}
1431
1432inline v_float64x8 v_invsqrt(const v_float64x8& x)
1433{
1434#if CV_AVX_512ER
1435 return v_float64x8(_mm512_rsqrt28_pd(x.val));
1436#else
1437 return v512_setall_f64(1.) / v_sqrt(x);
1438// v_float64x8 half = x * v512_setall_f64(0.5);
1439// v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
1440// t *= v512_setall_f64(1.5) - ((t * t) * half);
1441// t *= v512_setall_f64(1.5) - ((t * t) * half);
1442// return t;
1443#endif
1444}
1445
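// Usage sketch (illustrative): v_fma / v_muladd compute a*b + c per lane; with
// CV_FMA3 this maps to a single fused multiply-add instruction.
//   v_float32x16 a = v512_setall_f32(2.f), b = v512_setall_f32(3.f);
//   v_float32x16 r = v_fma(a, b, v512_setall_f32(1.f));   // every lane is 7
// v_invsqrt(x) approximates 1/sqrt(x): AVX-512ER uses the rsqrt28 estimate,
// otherwise a rough estimate plus one Newton-Raphson step (float) or a plain
// 1 / v_sqrt(x) (double) is used.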
1447#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \
1448 inline _Tpuvec v_abs(const _Tpvec& x) \
1449 { return _Tpuvec(_mm512_abs_##suffix(x.val)); }
1450
1451OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8)
1452OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16)
1453OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32)
1454OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64)
1455
1456inline v_float32x16 v_abs(const v_float32x16& x)
1457{
1458#ifdef _mm512_abs_pd
1459 return v_float32x16(_mm512_abs_ps(x.val));
1460#else
1461 return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),
1462 _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
1463 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
1464#endif
1465}
1466
1467inline v_float64x8 v_abs(const v_float64x8& x)
1468{
1469#ifdef _mm512_abs_pd
1470 #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
1471 // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476
1472 return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
1473 #else
1474 return v_float64x8(_mm512_abs_pd(x.val));
1475 #endif
1476#else
1477 return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),
1478 _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
1479 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
1480#endif
1481}
1482
1484inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
1485{ return v_add_wrap(a - b, b - a); }
1486inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
1487{ return v_add_wrap(a - b, b - a); }
1488inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
1489{ return v_max(a, b) - v_min(a, b); }
1490
1491inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
1492{
1493 v_int8x64 d = v_sub_wrap(a, b);
1494 v_int8x64 m = a < b;
1495 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1496}
1497
1498inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
1499{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1500
1501inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
1502{
1503 v_int32x16 d = a - b;
1504 v_int32x16 m = a < b;
1505 return v_reinterpret_as_u32((d ^ m) - m);
1506}
1507
1508inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
1509{ return v_abs(a - b); }
1510
1511inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
1512{ return v_abs(a - b); }
1513
1515inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
1516{
1517 v_int8x64 d = a - b;
1518 v_int8x64 m = a < b;
1519 return (d ^ m) - m;
1520}
1521inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
1522{ return v_max(a, b) - v_min(a, b); }
1523
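// Usage sketch (illustrative): v_absdiff returns |a - b| in the unsigned
// counterpart type, so it cannot overflow; v_absdiffs keeps the signed type
// and saturates instead.
//   v_int8x64 a = v512_setall_s8(-100), b = v512_setall_s8(100);
//   v_uint8x64 d  = v_absdiff(a, b);     // every lane is 200
//   v_int8x64  ds = v_absdiffs(a, b);    // every lane saturates to 127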
1525
1527inline v_int32x16 v_round(const v_float32x16& a)
1528{ return v_int32x16(_mm512_cvtps_epi32(a.val)); }
1529
1530inline v_int32x16 v_round(const v_float64x8& a)
1531{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); }
1532
1533inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)
1534{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), _mm512_cvtpd_epi32(b.val))); }
1535
1536inline v_int32x16 v_trunc(const v_float32x16& a)
1537{ return v_int32x16(_mm512_cvttps_epi32(a.val)); }
1538
1539inline v_int32x16 v_trunc(const v_float64x8& a)
1540{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); }
1541
1542#if CVT_ROUND_MODES_IMPLEMENTED
1543inline v_int32x16 v_floor(const v_float32x16& a)
1544{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); }
1545
1546inline v_int32x16 v_floor(const v_float64x8& a)
1547{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); }
1548
1549inline v_int32x16 v_ceil(const v_float32x16& a)
1550{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }
1551
1552inline v_int32x16 v_ceil(const v_float64x8& a)
1553{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); }
1554#else
1555inline v_int32x16 v_floor(const v_float32x16& a)
1556{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); }
1557
1558inline v_int32x16 v_floor(const v_float64x8& a)
1559{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); }
1560
1561inline v_int32x16 v_ceil(const v_float32x16& a)
1562{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); }
1563
1564inline v_int32x16 v_ceil(const v_float64x8& a)
1565{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); }
1566#endif
1567
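// Usage sketch (illustrative): float-to-int conversions with explicit rounding.
//   v_float32x16 x = v512_setall_f32(-1.5f);
//   v_int32x16 r = v_round(x);           // nearest, ties to even: -2
//   v_int32x16 f = v_floor(x);           // -2
//   v_int32x16 c = v_ceil(x);            // -1
//   v_int32x16 t = v_trunc(x);           // -1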
1569inline v_float32x16 v_cvt_f32(const v_int32x16& a)
1570{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); }
1571
1572inline v_float32x16 v_cvt_f32(const v_float64x8& a)
1573{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); }
1574
1575inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b)
1576{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }
1577
1578inline v_float64x8 v_cvt_f64(const v_int32x16& a)
1579{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }
1580
1581inline v_float64x8 v_cvt_f64_high(const v_int32x16& a)
1582{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }
1583
1584inline v_float64x8 v_cvt_f64(const v_float32x16& a)
1585{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }
1586
1587inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
1588{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }
1589
1590// from (Mysticial and wim) https://stackoverflow.com/q/41144668
1591inline v_float64x8 v_cvt_f64(const v_int64x8& v)
1592{
1593#if CV_AVX_512DQ
1594 return v_float64x8(_mm512_cvtepi64_pd(v.val));
1595#else
1596 // constants encoded as floating-point
1597 __m512i magic_i_lo = _mm512_set1_epi64(0x4330000000000000); // 2^52
1598 __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
1599 __m512i magic_i_all = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
1600 __m512d magic_d_all = _mm512_castsi512_pd(magic_i_all);
1601
1602 // Blend the lower 32 bits of each 64-bit element of v with magic_i_lo
1603 __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
1604 // Extract the upper 32 bits of each 64-bit element of v
1605 __m512i v_hi = _mm512_srli_epi64(v.val, 32);
1606 // Flip the msb of v_hi and blend with 0x45300000
1607 v_hi = _mm512_xor_si512(v_hi, magic_i_hi32);
1608 // Compute in double precision
1609 __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
1610 // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
1611 __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
1612 return v_float64x8(result);
1613#endif
1614}
1615
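// Usage sketch (illustrative): v_cvt_f64 widens the low half of an int32 (or
// float32) vector to 8 doubles, v_cvt_f64_high widens the upper half, and the
// int64 overload above converts all 8 lanes (via the 2^52/2^84 constant trick
// when AVX-512DQ is not available).
//   v_int32x16 i = v512_setall_s32(3);
//   v_float64x8 lo = v_cvt_f64(i);       // doubles made from lanes 0..7
//   v_float64x8 hi = v_cvt_f64_high(i);  // doubles made from lanes 8..15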
1617
1618inline v_int8x64 v512_lut(const schar* tab, const int* idx)
1619{
1620 __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1));
1621 __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
1622 __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
1623 __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
1624 return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3));
1625}
1626inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)
1627{
1628 __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1));
1629 __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
1630 return v_int8x64(_v512_combine(p0, p1));
1631}
1632inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)
1633{
1634 return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
1635}
1636inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); }
1637inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); }
1638inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); }
1639
1640inline v_int16x32 v512_lut(const short* tab, const int* idx)
1641{
1642 __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 2));
1643 __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
1644 return v_int16x32(_v512_combine(p0, p1));
1645}
1646inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)
1647{
1648 return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
1649}
1650inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)
1651{
1652#if defined(__GNUC__)
1653 return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
1654#else
1655 return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
1656#endif
1657}
1658inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); }
1659inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); }
1660inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); }
1661
1662inline v_int32x16 v512_lut(const int* tab, const int* idx)
1663{
1664 return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
1665}
1666inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)
1667{
1668#if defined(__GNUC__)
1669 return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
1670#else
1671 return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
1672#endif
1673}
1674inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)
1675{
1676 return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
1677 _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
1678 _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
1679 _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
1680 _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
1681}
1682inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); }
1683inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); }
1684inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); }
1685
1686inline v_int64x8 v512_lut(const int64* tab, const int* idx)
1687{
1688#if defined(__GNUC__)
1689 return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
1690#else
1691 return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8));
1692#endif
1693}
1694inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)
1695{
1696 return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
1697 _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
1698 _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
1699 _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
1700 _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
1701}
1702inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); }
1703inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); }
1704
1705inline v_float32x16 v512_lut(const float* tab, const int* idx)
1706{
1707 return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
1708}
1709inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); }
1710inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); }
1711
1712inline v_float64x8 v512_lut(const double* tab, const int* idx)
1713{
1714 return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
1715}
1716inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)
1717{
1718 return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
1719 _mm_loadu_pd(tab + idx[0])),
1720 _mm_loadu_pd(tab + idx[1]), 1),
1721 _mm_loadu_pd(tab + idx[2]), 2),
1722 _mm_loadu_pd(tab + idx[3]), 3));
1723}
1724
1725inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)
1726{
1727 return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
1728}
1729
1730inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)
1731{
1732 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1733}
1734
1735inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)
1736{
1737 return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
1738}
1739
1740inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)
1741{
1742 return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
1743}
1744
1745inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y)
1746{
1747 x.val = _mm512_i32gather_ps(idxvec.val, tab, 4);
1748 y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4);
1749}
1750
1751inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y)
1752{
1753 x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8);
1754 y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8);
1755}
1756
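// Usage sketch (illustrative): the lookup-table loads gather tab[idx[i]] for
// every index; the _pairs/_quads variants fetch 2 or 4 consecutive elements
// per index.
//   float tab[256];                      // assumed to be filled by the caller
//   int   idx[16];                       // 16 valid indices into tab
//   v_float32x16 v = v512_lut(tab, idx);         // v[i] == tab[idx[i]]
// v_lut(tab, idxvec) is the same with the indices already in a v_int32x16, and
// v_lut_deinterleave gathers (x, y) pairs into two separate vectors.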
1757inline v_int8x64 v_interleave_pairs(const v_int8x64& vec)
1758{
1759 return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200)));
1760}
1761inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1762inline v_int8x64 v_interleave_quads(const v_int8x64& vec)
1763{
1764 return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400)));
1765}
1766inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1767
1768inline v_int16x32 v_interleave_pairs(const v_int16x32& vec)
1769{
1770 return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100)));
1771}
1772inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1773inline v_int16x32 v_interleave_quads(const v_int16x32& vec)
1774{
1775 return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100)));
1776}
1777inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1778
1779inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
1780{
1781 return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD));
1782}
1783inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1784inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1785
1786inline v_int8x64 v_pack_triplets(const v_int8x64& vec)
1787{
1788 return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1789 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000),
1790 _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100))));
1791}
1792inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1793
1794inline v_int16x32 v_pack_triplets(const v_int16x32& vec)
1795{
1796 return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015,
1797 0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val));
1798}
1799inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1800
1801inline v_int32x16 v_pack_triplets(const v_int32x16& vec)
1802{
1803 return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1804 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
1805}
1806inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1807inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
1808{
1809 return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1810 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
1811}
1812
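// Usage sketch (illustrative): v_interleave_pairs / v_interleave_quads reorder
// the lanes inside every group of four (or eight) elements, while
// v_pack_triplets drops the fourth lane of every group of four and packs the
// remaining triplets contiguously into the low lanes.
//   int pts[16];                         // x0 y0 z0 w0 x1 y1 z1 w1 ... (filled by caller)
//   v_int32x16 t = v_pack_triplets(v512_load(pts));  // x0 y0 z0 x1 y1 z1 ...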
1814
1816
1817// 16 >> 32
1818inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
1819{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
1820inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
1821{ return v_dotprod(a, b) + c; }
1822
1823// 32 >> 64
1824inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
1825{
1826 __m512i even = _mm512_mul_epi32(a.val, b.val);
1827 __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
1828 return v_int64x8(_mm512_add_epi64(even, odd));
1829}
1830inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
1831{ return v_dotprod(a, b) + c; }
1832
1833// 8 >> 32
1834inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
1835{
1836 __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
1837 __m512i odd_a = _mm512_srli_epi16(a.val, 8);
1838
1839 __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
1840 __m512i odd_b = _mm512_srli_epi16(b.val, 8);
1841
1842 __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
1843 __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
1844 return v_uint32x16(_mm512_add_epi32(prod0, prod1));
1845}
1846inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
1847{ return v_dotprod_expand(a, b) + c; }
1848
1849inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
1850{
1851 __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
1852 __m512i odd_a = _mm512_srai_epi16(a.val, 8);
1853
1854 __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
1855 __m512i odd_b = _mm512_srai_epi16(b.val, 8);
1856
1857 __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
1858 __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
1859 return v_int32x16(_mm512_add_epi32(prod0, prod1));
1860}
1861inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
1862{ return v_dotprod_expand(a, b) + c; }
1863
1864// 16 >> 64
1865inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
1866{
1867 __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
1868 __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
1869 __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
1870 __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
1871
1872 __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
1873 __m512i p13 = _mm512_srli_epi64(mul0, 32);
1874 __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
1875 __m512i p57 = _mm512_srli_epi64(mul1, 32);
1876
1877 __m512i p15_ = _mm512_add_epi64(p02, p13);
1878 __m512i p9d_ = _mm512_add_epi64(p46, p57);
1879
1880 return v_uint64x8(_mm512_add_epi64(
1881 _mm512_unpacklo_epi64(p15_, p9d_),
1882 _mm512_unpackhi_epi64(p15_, p9d_)
1883 ));
1884}
1885inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
1886{ return v_dotprod_expand(a, b) + c; }
1887
1888inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
1889{
1890 __m512i prod = _mm512_madd_epi16(a.val, b.val);
1891 __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
1892 __m512i odd = _mm512_srai_epi64(prod, 32);
1893 return v_int64x8(_mm512_add_epi64(even, odd));
1894}
1895inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
1896{ return v_dotprod_expand(a, b) + c; }
1897
1898// 32 >> 64f
1899inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
1900{ return v_cvt_f64(v_dotprod(a, b)); }
1901inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
1902{ return v_dotprod_expand(a, b) + c; }
1903
1905
1906// 16 >> 32
1907inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
1908{ return v_dotprod(a, b); }
1909inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
1910{ return v_dotprod(a, b, c); }
1911
1912// 32 >> 64
1913inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
1914{ return v_dotprod(a, b); }
1915inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
1916{ return v_dotprod(a, b, c); }
1917
1918// 8 >> 32
1919inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b)
1920{ return v_dotprod_expand(a, b); }
1921inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
1922{ return v_dotprod_expand(a, b, c); }
1923
1924inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b)
1925{ return v_dotprod_expand(a, b); }
1926inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
1927{ return v_dotprod_expand(a, b, c); }
1928
1929// 16 >> 64
1930inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
1931{
1932 __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
1933 __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
1934 __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
1935 __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
1936
1937 __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
1938 __m512i p13 = _mm512_srli_epi64(mul0, 32);
1939 __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
1940 __m512i p57 = _mm512_srli_epi64(mul1, 32);
1941
1942 __m512i p15_ = _mm512_add_epi64(p02, p13);
1943 __m512i p9d_ = _mm512_add_epi64(p46, p57);
1944 return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
1945}
1946inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
1947{ return v_dotprod_expand_fast(a, b) + c; }
1948
1949inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
1950{ return v_dotprod_expand(a, b); }
1951inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
1952{ return v_dotprod_expand(a, b, c); }
1953
1954// 32 >> 64f
1955inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
1956{ return v_dotprod_expand(a, b); }
1957inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
1958{ return v_dotprod_expand(a, b) + c; }
1959
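// Usage sketch (illustrative): v_dotprod multiplies adjacent lane pairs and
// sums each pair into the next wider type; v_dotprod_expand widens twice
// (e.g. 8-bit inputs to 32-bit sums). The _fast variants may deliver the
// partial sums in a different lane order, which is fine when the caller only
// needs the final v_reduce_sum.
//   v_int16x32 a = v512_setall_s16(2), b = v512_setall_s16(3);
//   v_int32x16 d = v_dotprod(a, b);      // every lane is 2*3 + 2*3 = 12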
1960
1961#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
1962 v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1963
1964inline v_float32x16 v_matmul(const v_float32x16& v,
1965 const v_float32x16& m0, const v_float32x16& m1,
1966 const v_float32x16& m2, const v_float32x16& m3)
1967{
1968 v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
1969 v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
1970 v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
1971 v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
1972 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1973}
1974
1975inline v_float32x16 v_matmuladd(const v_float32x16& v,
1976 const v_float32x16& m0, const v_float32x16& m1,
1977 const v_float32x16& m2, const v_float32x16& a)
1978{
1979 v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
1980 v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
1981 v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
1982 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1983}
1984
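// Usage sketch (illustrative): for every 128-bit block, v_matmul computes
// v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3 (indices within that block), i.e.
// four independent 4x4 matrix products per register; v_matmuladd replaces the
// last term with a plain additive vector.
//   v_float32x16 res = v_matmul(v, m0, m1, m2, m3);   // v, m0..m3 assumed loaded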
1985#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1986 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1987 const _Tpvec& a2, const _Tpvec& a3, \
1988 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1989 { \
1990 __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \
1991 __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \
1992 __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \
1993 __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \
1994 b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \
1995 b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \
1996 b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \
1997 b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \
1998 }
1999
2000OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2001OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2002OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps)
2003
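// Usage sketch (illustrative): for every 128-bit block j, v_transpose4x4
// transposes the 4x4 matrix whose rows are the j-th blocks of a0..a3, so
// b0 receives the first column, b1 the second, and so on.
//   v_float32x16 r0, r1, r2, r3, c0, c1, c2, c3;   // rows assumed loaded
//   v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);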
2004
2005
2006/* Expand */
2007#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
2008 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
2009 { \
2010 b0.val = intrin(_v512_extract_low(a.val)); \
2011 b1.val = intrin(_v512_extract_high(a.val)); \
2012 } \
2013 inline _Tpwvec v_expand_low(const _Tpvec& a) \
2014 { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \
2015 inline _Tpwvec v_expand_high(const _Tpvec& a) \
2016 { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \
2017 inline _Tpwvec v512_load_expand(const _Tp* ptr) \
2018 { \
2019 __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \
2020 return _Tpwvec(intrin(a)); \
2021 }
2022
2023OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16)
2024OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16)
2025OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32)
2026OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32)
2027OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64)
2028OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64)
2029
2030#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
2031 inline _Tpvec v512_load_expand_q(const _Tp* ptr) \
2032 { \
2033 __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
2034 return _Tpvec(intrin(a)); \
2035 }
2036
2037OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32)
2038OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32)
2039
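// Usage sketch (illustrative): v_expand widens every lane to the next wider
// type, splitting the result into a low and a high half; v512_load_expand
// loads half a register worth of narrow data and widens it in one step.
//   v_uint8x64 v = v512_load(buf);           // buf assumed valid, 64 bytes
//   v_uint16x32 lo, hi;
//   v_expand(v, lo, hi);                     // lo = lanes 0..31, hi = lanes 32..63
//   v_uint16x32 w = v512_load_expand(buf);   // widened from the first 32 bytes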
2040/* pack */
2041// 16
2042inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
2043{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
2044
2045inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
2046{
2047 const __m512i t = _mm512_set1_epi16(255);
2048 return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t))));
2049}
2050
2051inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
2052{
2053 return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
2054}
2055
2056inline void v_pack_store(schar* ptr, const v_int16x32& a)
2057{ v_store_low(ptr, v_pack(a, a)); }
2058
2059inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
2060{
2061 const __m512i m = _mm512_set1_epi16(255);
2062 _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
2063}
2064
2065inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
2066{ v_store_low(ptr, v_pack_u(a, a)); }
2067
2068template<int n> inline
2069v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
2070{
2071 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
2072 v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
2073 return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
2074 v_reinterpret_as_s16((b + delta) >> n));
2075}
2076
2077template<int n> inline
2078void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
2079{
2080 v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
2081 v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
2082}
2083
2084template<int n> inline
2085v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
2086{
2087 v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2088 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2089}
2090
2091template<int n> inline
2092void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
2093{
2094 v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2095 v_pack_u_store(ptr, (a + delta) >> n);
2096}
2097
2098template<int n> inline
2099v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
2100{
2101 v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2102 return v_pack((a + delta) >> n, (b + delta) >> n);
2103}
2104
2105template<int n> inline
2106void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
2107{
2108 v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2109 v_pack_store(ptr, (a + delta) >> n);
2110}
2111
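// Usage sketch (illustrative): v_rshr_pack<n> adds the rounding constant
// 1 << (n-1), shifts right by n and packs with saturation, i.e. a rounding
// fixed-point down-conversion.
//   v_uint16x32 a = v512_setall_u16(300), b = v512_setall_u16(40);
//   v_uint8x64  p = v_rshr_pack<4>(a, b);    // (300+8)>>4 = 19 and (40+8)>>4 = 3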
2112// 32
2113inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
2114{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
2115
2116inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
2117{
2118 const __m512i m = _mm512_set1_epi32(65535);
2119 return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
2120}
2121
2122inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
2123{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
2124
2125inline void v_pack_store(short* ptr, const v_int32x16& a)
2126{ v_store_low(ptr, v_pack(a, a)); }
2127
2128inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
2129{
2130 const __m512i m = _mm512_set1_epi32(65535);
2131 _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
2132}
2133
2134inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
2135{ v_store_low(ptr, v_pack_u(a, a)); }
2136
2137
2138template<int n> inline
2139v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
2140{
2141 v_uint32x16 delta = v512_setall_u32(1 << (n-1));
2142 return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
2143 v_reinterpret_as_s32((b + delta) >> n));
2144}
2145
2146template<int n> inline
2147void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
2148{
2149 v_uint32x16 delta = v512_setall_u32(1 << (n-1));
2150 v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
2151}
2152
2153template<int n> inline
2154v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
2155{
2156 v_int32x16 delta = v512_setall_s32(1 << (n-1));
2157 return v_pack_u((a + delta) >> n, (b + delta) >> n);
2158}
2159
2160template<int n> inline
2161void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
2162{
2163 v_int32x16 delta = v512_setall_s32(1 << (n-1));
2164 v_pack_u_store(ptr, (a + delta) >> n);
2165}
2166
2167template<int n> inline
2168v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
2169{
2170 v_int32x16 delta = v512_setall_s32(1 << (n-1));
2171 return v_pack((a + delta) >> n, (b + delta) >> n);
2172}
2173
2174template<int n> inline
2175void v_rshr_pack_store(short* ptr, const v_int32x16& a)
2176{
2177 v_int32x16 delta = v512_setall_s32(1 << (n-1));
2178 v_pack_store(ptr, (a + delta) >> n);
2179}
2180
2181// 64
2182// Non-saturating pack
2183inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
2184{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }
2185
2186inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
2187{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2188
2189inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
2190{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }
2191
2192inline void v_pack_store(int* ptr, const v_int64x8& b)
2193{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2194
2195template<int n> inline
2196v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
2197{
2198 v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
2199 return v_pack((a + delta) >> n, (b + delta) >> n);
2200}
2201
2202template<int n> inline
2203void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
2204{
2205 v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
2206 v_pack_store(ptr, (a + delta) >> n);
2207}
2208
2209template<int n> inline
2210v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
2211{
2212 v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
2213 return v_pack((a + delta) >> n, (b + delta) >> n);
2214}
2215
2216template<int n> inline
2217void v_rshr_pack_store(int* ptr, const v_int64x8& a)
2218{
2219 v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
2220 v_pack_store(ptr, (a + delta) >> n);
2221}
2222
2223// pack boolean
2224inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
2225{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
2226
2227inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
2228 const v_uint32x16& c, const v_uint32x16& d)
2229{
2230 __m512i ab = _mm512_packs_epi32(a.val, b.val);
2231 __m512i cd = _mm512_packs_epi32(c.val, d.val);
2232
2233 return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
2234}
2235
2236inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
2237 const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
2238 const v_uint64x8& g, const v_uint64x8& h)
2239{
2240 __m512i ab = _mm512_packs_epi32(a.val, b.val);
2241 __m512i cd = _mm512_packs_epi32(c.val, d.val);
2242 __m512i ef = _mm512_packs_epi32(e.val, f.val);
2243 __m512i gh = _mm512_packs_epi32(g.val, h.val);
2244
2245 __m512i abcd = _mm512_packs_epi32(ab, cd);
2246 __m512i efgh = _mm512_packs_epi32(ef, gh);
2247
2248 return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
2249 27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
2250}
2251
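// Usage sketch (illustrative): v_pack_b packs 2, 4 or 8 mask vectors (lanes
// that are all zeros or all ones, as produced by comparisons) into one byte
// mask vector, preserving the lane order of the inputs.
//   v_uint16x32 m0 = (x0 == y0), m1 = (x1 == y1);    // x*, y* assumed defined
//   v_uint8x64  mb = v_pack_b(m0, m1);               // 0x00 or 0xff per byte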
2252/* Recombine */
2253// it's defined above, together with the load and store operations
2254
2255/* Extract */
2256#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
2257 template<int s> \
2258 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2259 { return v_rotate_right<s>(a, b); }
2260
2261OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
2262OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
2263OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
2264OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
2265OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
2266OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
2267OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
2268OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
2269OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
2270OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
2271
2272#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
2273template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }
2274
2275OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
2276OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
2277OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
2278OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
2279OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
2280OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
2281OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
2282OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
2283OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
2284OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
2285
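// Usage sketch (illustrative): v_extract<s>(a, b) concatenates a (low part)
// and b (high part) and returns nlanes consecutive lanes starting at index s,
// while v_extract_n<i>(v) returns lane i as a scalar.
//   v_int32x16 v = v512_load(data);      // data assumed valid
//   int third = v_extract_n<3>(v);       // data[3]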
2286template<int i>
2287inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
2288{
2289 static const __m512i perm = _mm512_set1_epi32((char)i);
2290 return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
2291}
2292
2293template<int i>
2294inline v_int32x16 v_broadcast_element(const v_int32x16 &a)
2295{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2296
2297template<int i>
2298inline v_float32x16 v_broadcast_element(const v_float32x16 &a)
2299{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2300
2301
2303
2304inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
2305{
2306 __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2307 __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2308#if CV_AVX_512VBMI
2309 __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
2310 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
2311 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2312 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2313 __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
2314 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
2315 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2316 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2317 a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
2318 b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
2319#else
2320 __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
2321 __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
2322 __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
2323 __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2324 __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2325 a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
2326 b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
2327#endif
2328}
2329
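// Usage sketch (illustrative): the two-channel v_load_deinterleave reads
// 2 * nlanes interleaved values and splits them into separate channel vectors.
//   uchar xy[128];                       // x0 y0 x1 y1 ... (filled by caller)
//   v_uint8x64 x, y;
//   v_load_deinterleave(xy, x, y);       // x = x0 x1 ..., y = y0 y1 ...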
2330inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
2331{
2332 __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2333 __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2334 __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2335 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2336 __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2337 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2338 a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
2339 b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
2340}
2341
2342inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
2343{
2344 __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2345 __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2346 __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2347 __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2348 a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1));
2349 b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1));
2350}
2351
2352inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b )
2353{
2354 __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2355 __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2356 __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2357 __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2358 a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1));
2359 b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
2360}
2361
2362inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
2363{
2364 __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2365 __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2366 __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
2367
2368#if CV_AVX_512VBMI2
2369 __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81,
2370 78, 75, 72, 69, 66, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33,
2371 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 62, 59, 56, 53, 50,
2372 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
2373 __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1);
2374 __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2);
2375 __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
2376 _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
2377 77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97,
2378 94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49,
2379 46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2);
2380 a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
2381 b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
2382 c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
2383#elif CV_AVX_512VBMI
2384 __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
2385 __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
2386 __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
2387 a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
2388 77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48,
2389 46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24,
2390 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2));
2391 b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40,
2392 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17,
2393 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114,
2394 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0));
2395 c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18,
2396 15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98,
2397 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50,
2398 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1));
2399#else
2400 __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
2401 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2402 __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
2403 __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
2404 __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
2405
2406 __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2);
2407 __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
2408 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
2409 __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
2410 a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
2411 c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
2412 b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
2413#endif
2414}
2415
2416inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
2417{
2418 __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2419 __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2420 __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2421
2422 __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
2423 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2424 __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
2425 __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
2426 __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
2427
2428 a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
2429 b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
2430 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
2431 c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
2432}
2433
2434inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
2435{
2436 __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2437 __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2438 __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2439
2440 __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2441 __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1);
2442 __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
2443 __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);
2444
2445 a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
2446 b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
2447 c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
2448}
2449
2450inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
2451{
2452 __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2453 __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2454 __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2455
2456 __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0);
2457 __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1);
2458 __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
2459 __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);
2460
2461 a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
2462 c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
2463 b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
2464}
2465
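// De-interleave four channels (e.g. packed BGRA): two rounds of even/odd
// permutes first separate the (b,r) and (g,a) pairs and then split them into
// the individual channels. The 8-bit variant permutes at byte granularity under
// AVX-512VBMI; otherwise it first groups each channel's bytes within 128-bit
// lanes with a byte shuffle and then permutes at 32-bit granularity.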
2466inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
2467{
2468 __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2469 __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2470 __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
2471 __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192));
2472
2473#if CV_AVX_512VBMI
2474 __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
2475 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
2476 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2477 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2478 __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
2479 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
2480 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2481 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2482
2483 __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1);
2484 __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1);
2485 __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
2486 __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);
2487
2488 a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
2489 c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
2490 b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
2491 d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
2492#else
2493 __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
2494 __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
2495 __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask);
2496 __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask);
2497 __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask);
2498
2499 __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2500 __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2501
2502 __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1);
2503 __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1);
2504 __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
2505 __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);
2506
2507 a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
2508 c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
2509 b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
2510 d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
2511#endif
2512}
2513
2514inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
2515{
2516 __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2517 __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2518 __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2519 __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96));
2520
2521 __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2522 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2523 __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2524 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2525
2526 __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1);
2527 __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1);
2528 __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
2529 __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);
2530
2531 a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
2532 c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
2533 b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
2534 d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
2535}
2536
2537inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
2538{
2539 __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2540 __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2541 __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2542 __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48));
2543
2544 __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2545 __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2546
2547 __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1);
2548 __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1);
2549 __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
2550 __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);
2551
2552 a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
2553 c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
2554 b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
2555 d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
2556}
2557
2558inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
2559{
2560 __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2561 __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2562 __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2563 __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24));
2564
2565 __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2566 __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2567
2568 __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1);
2569 __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1);
2570 __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
2571 __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);
2572
2573 a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
2574 c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
2575 b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
2576 d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
2577}
2578
2580
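// Interleave and store: the two-channel versions simply zip the inputs into
// (x0,y0,x1,y1,...) order and write the two resulting vectors with the
// requested store mode (streaming, aligned or unaligned).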
2581inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
2582                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2583{
2584 v_uint8x64 low, high;
2585 v_zip(x, y, low, high);
2586 if( mode == hal::STORE_ALIGNED_NOCACHE )
2587 {
2588 _mm512_stream_si512((__m512i*)ptr, low.val);
2589 _mm512_stream_si512((__m512i*)(ptr + 64), high.val);
2590 }
2591 else if( mode == hal::STORE_ALIGNED )
2592 {
2593 _mm512_store_si512((__m512i*)ptr, low.val);
2594 _mm512_store_si512((__m512i*)(ptr + 64), high.val);
2595 }
2596 else
2597 {
2598 _mm512_storeu_si512((__m512i*)ptr, low.val);
2599 _mm512_storeu_si512((__m512i*)(ptr + 64), high.val);
2600 }
2601}
2602
2603inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
2604                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2605{
2606 v_uint16x32 low, high;
2607 v_zip(x, y, low, high);
2608 if( mode == hal::STORE_ALIGNED_NOCACHE )
2609 {
2610 _mm512_stream_si512((__m512i*)ptr, low.val);
2611 _mm512_stream_si512((__m512i*)(ptr + 32), high.val);
2612 }
2613 else if( mode == hal::STORE_ALIGNED )
2614 {
2615 _mm512_store_si512((__m512i*)ptr, low.val);
2616 _mm512_store_si512((__m512i*)(ptr + 32), high.val);
2617 }
2618 else
2619 {
2620 _mm512_storeu_si512((__m512i*)ptr, low.val);
2621 _mm512_storeu_si512((__m512i*)(ptr + 32), high.val);
2622 }
2623}
2624
2625inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
2626                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2627{
2628 v_uint32x16 low, high;
2629 v_zip(x, y, low, high);
2630 if( mode == hal::STORE_ALIGNED_NOCACHE )
2631 {
2632 _mm512_stream_si512((__m512i*)ptr, low.val);
2633 _mm512_stream_si512((__m512i*)(ptr + 16), high.val);
2634 }
2635 else if( mode == hal::STORE_ALIGNED )
2636 {
2637 _mm512_store_si512((__m512i*)ptr, low.val);
2638 _mm512_store_si512((__m512i*)(ptr + 16), high.val);
2639 }
2640 else
2641 {
2642 _mm512_storeu_si512((__m512i*)ptr, low.val);
2643 _mm512_storeu_si512((__m512i*)(ptr + 16), high.val);
2644 }
2645}
2646
2647inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
2648                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2649{
2650 v_uint64x8 low, high;
2651 v_zip(x, y, low, high);
2652 if( mode == hal::STORE_ALIGNED_NOCACHE )
2653 {
2654 _mm512_stream_si512((__m512i*)ptr, low.val);
2655 _mm512_stream_si512((__m512i*)(ptr + 8), high.val);
2656 }
2657 else if( mode == hal::STORE_ALIGNED )
2658 {
2659 _mm512_store_si512((__m512i*)ptr, low.val);
2660 _mm512_store_si512((__m512i*)(ptr + 8), high.val);
2661 }
2662 else
2663 {
2664 _mm512_storeu_si512((__m512i*)ptr, low.val);
2665 _mm512_storeu_si512((__m512i*)(ptr + 8), high.val);
2666 }
2667}
2668
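// Interleave and store three channels: rebuild the packed (b,g,r) layout with
// cross-lane permutes and masked blends before storing. The 8-bit variant has a
// direct byte-permute path under AVX-512VBMI and a 16-bit permute fallback; the
// 32- and 64-bit variants additionally use masked expands to assemble the first
// output vector.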
2669inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
2670                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2671{
2672#if CV_AVX_512VBMI
2673 __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122,
2674 79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74,
2675 10, 116, 73, 9, 115, 72, 8, 114, 71, 7, 113, 70, 6, 112, 69, 5,
2676 111, 68, 4, 110, 67, 3, 109, 66, 2, 108, 65, 1, 107, 64, 0, 106);
2677 __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16,
2678 37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32,
2679 95, 10, 31, 94, 9, 30, 93, 8, 29, 92, 7, 28, 91, 6, 27, 90,
2680 5, 26, 89, 4, 25, 88, 3, 24, 87, 2, 23, 86, 1, 22, 85, 0);
2681 __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101,
2682 122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117,
2683 53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48,
2684 90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85);
2685 __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
2686 __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
2687 __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);
2688
2689 __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
2690 __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
2691 __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
2692#else
2693 __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
2694 __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
2695 __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
2696 __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);
2697
2698 __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
2699 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
2700 __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
2701 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
2702 __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
2703 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
2704 __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1);
2705 __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
2706 __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);
2707
2708 __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
2709 __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
2710 __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
2711#endif
2712
2713 if( mode == hal::STORE_ALIGNED_NOCACHE )
2714 {
2715 _mm512_stream_si512((__m512i*)ptr, bgr0);
2716 _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
2717 _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
2718 }
2719 else if( mode == hal::STORE_ALIGNED )
2720 {
2721 _mm512_store_si512((__m512i*)ptr, bgr0);
2722 _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
2723 _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
2724 }
2725 else
2726 {
2727 _mm512_storeu_si512((__m512i*)ptr, bgr0);
2728 _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
2729 _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
2730 }
2731}
2732
2733inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
2734                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2735{
2736 __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
2737 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
2738 __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
2739 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
2740 __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
2741 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
2742 __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
2743 __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
2744 __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);
2745
2746 __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
2747 __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
2748 __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
2749
2750 if( mode == hal::STORE_ALIGNED_NOCACHE )
2751 {
2752 _mm512_stream_si512((__m512i*)ptr, bgr0);
2753 _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
2754 _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
2755 }
2756 else if( mode == hal::STORE_ALIGNED )
2757 {
2758 _mm512_store_si512((__m512i*)ptr, bgr0);
2759 _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
2760 _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
2761 }
2762 else
2763 {
2764 _mm512_storeu_si512((__m512i*)ptr, bgr0);
2765 _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
2766 _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
2767 }
2768}
2769
2770inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
2771                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2772{
2773 __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
2774 __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26);
2775 __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
2776 __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);
2777
2778 __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
2779 __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
2780 __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);
2781
2782 if( mode == hal::STORE_ALIGNED_NOCACHE )
2783 {
2784 _mm512_stream_si512((__m512i*)ptr, bgr0);
2785 _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
2786 _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
2787 }
2788 else if( mode == hal::STORE_ALIGNED )
2789 {
2790 _mm512_store_si512((__m512i*)ptr, bgr0);
2791 _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
2792 _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
2793 }
2794 else
2795 {
2796 _mm512_storeu_si512((__m512i*)ptr, bgr0);
2797 _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
2798 _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
2799 }
2800}
2801
2802inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
2803                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2804{
2805 __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10);
2806 __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5);
2807 __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
2808 __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);
2809
2810 __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
2811 __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
2812 __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);
2813
2814 if( mode == hal::STORE_ALIGNED_NOCACHE )
2815 {
2816 _mm512_stream_si512((__m512i*)ptr, bgr0);
2817 _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
2818 _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
2819 }
2820 else if( mode == hal::STORE_ALIGNED )
2821 {
2822 _mm512_store_si512((__m512i*)ptr, bgr0);
2823 _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
2824 _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
2825 }
2826 else
2827 {
2828 _mm512_storeu_si512((__m512i*)ptr, bgr0);
2829 _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
2830 _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
2831 }
2832}
2833
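// Interleave and store four channels: two rounds of v_zip recreate the packed
// (b,g,r,a) layout from the per-channel inputs, then the four resulting vectors
// are written with the requested store mode.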
2834inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
2835                                const v_uint8x64& c, const v_uint8x64& d,
2836                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2837{
2838 v_uint8x64 br01, br23, ga01, ga23;
2839 v_zip(a, c, br01, br23);
2840 v_zip(b, d, ga01, ga23);
2841 v_uint8x64 bgra0, bgra1, bgra2, bgra3;
2842 v_zip(br01, ga01, bgra0, bgra1);
2843 v_zip(br23, ga23, bgra2, bgra3);
2844
2845 if( mode == hal::STORE_ALIGNED_NOCACHE )
2846 {
2847 _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2848 _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val);
2849 _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val);
2850 _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val);
2851 }
2852 else if( mode == hal::STORE_ALIGNED )
2853 {
2854 _mm512_store_si512((__m512i*)ptr, bgra0.val);
2855 _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val);
2856 _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val);
2857 _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val);
2858 }
2859 else
2860 {
2861 _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2862 _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val);
2863 _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val);
2864 _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val);
2865 }
2866}
2867
2868inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
2869                                const v_uint16x32& c, const v_uint16x32& d,
2870                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2871{
2872 v_uint16x32 br01, br23, ga01, ga23;
2873 v_zip(a, c, br01, br23);
2874 v_zip(b, d, ga01, ga23);
2875 v_uint16x32 bgra0, bgra1, bgra2, bgra3;
2876 v_zip(br01, ga01, bgra0, bgra1);
2877 v_zip(br23, ga23, bgra2, bgra3);
2878
2879 if( mode == hal::STORE_ALIGNED_NOCACHE )
2880 {
2881 _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2882 _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val);
2883 _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val);
2884 _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val);
2885 }
2886 else if( mode == hal::STORE_ALIGNED )
2887 {
2888 _mm512_store_si512((__m512i*)ptr, bgra0.val);
2889 _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val);
2890 _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val);
2891 _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val);
2892 }
2893 else
2894 {
2895 _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2896 _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val);
2897 _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val);
2898 _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val);
2899 }
2900}
2901
2902inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
2903                                const v_uint32x16& c, const v_uint32x16& d,
2904                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2905{
2906 v_uint32x16 br01, br23, ga01, ga23;
2907 v_zip(a, c, br01, br23);
2908 v_zip(b, d, ga01, ga23);
2909 v_uint32x16 bgra0, bgra1, bgra2, bgra3;
2910 v_zip(br01, ga01, bgra0, bgra1);
2911 v_zip(br23, ga23, bgra2, bgra3);
2912
2913 if( mode == hal::STORE_ALIGNED_NOCACHE )
2914 {
2915 _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2916 _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val);
2917 _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val);
2918 _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val);
2919 }
2920 else if( mode == hal::STORE_ALIGNED )
2921 {
2922 _mm512_store_si512((__m512i*)ptr, bgra0.val);
2923 _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val);
2924 _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val);
2925 _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val);
2926 }
2927 else
2928 {
2929 _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2930 _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val);
2931 _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val);
2932 _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val);
2933 }
2934}
2935
2936inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
2937                                const v_uint64x8& c, const v_uint64x8& d,
2938                                hal::StoreMode mode=hal::STORE_UNALIGNED )
2939{
2940 v_uint64x8 br01, br23, ga01, ga23;
2941 v_zip(a, c, br01, br23);
2942 v_zip(b, d, ga01, ga23);
2943 v_uint64x8 bgra0, bgra1, bgra2, bgra3;
2944 v_zip(br01, ga01, bgra0, bgra1);
2945 v_zip(br23, ga23, bgra2, bgra3);
2946
2947 if( mode == hal::STORE_ALIGNED_NOCACHE )
2948 {
2949 _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2950 _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val);
2951 _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val);
2952 _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val);
2953 }
2954 else if( mode == hal::STORE_ALIGNED )
2955 {
2956 _mm512_store_si512((__m512i*)ptr, bgra0.val);
2957 _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val);
2958 _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val);
2959 _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val);
2960 }
2961 else
2962 {
2963 _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2964 _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val);
2965 _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val);
2966 _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val);
2967 }
2968}
2969
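// Forward the signed and floating-point element types to the unsigned
// implementations above through bit-exact reinterpret casts, so every lane type
// gets the full set of de-interleave/interleave overloads.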
2970#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2971inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2972{ \
2973 _Tpvec1 a1, b1; \
2974 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2975 a0 = v_reinterpret_as_##suffix0(a1); \
2976 b0 = v_reinterpret_as_##suffix0(b1); \
2977} \
2978inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2979{ \
2980 _Tpvec1 a1, b1, c1; \
2981 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2982 a0 = v_reinterpret_as_##suffix0(a1); \
2983 b0 = v_reinterpret_as_##suffix0(b1); \
2984 c0 = v_reinterpret_as_##suffix0(c1); \
2985} \
2986inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2987{ \
2988 _Tpvec1 a1, b1, c1, d1; \
2989 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2990 a0 = v_reinterpret_as_##suffix0(a1); \
2991 b0 = v_reinterpret_as_##suffix0(b1); \
2992 c0 = v_reinterpret_as_##suffix0(c1); \
2993 d0 = v_reinterpret_as_##suffix0(d1); \
2994} \
2995inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2996 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2997{ \
2998 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2999 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3000 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
3001} \
3002inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
3003 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3004{ \
3005 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3006 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3007 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3008 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
3009} \
3010inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3011 const _Tpvec0& c0, const _Tpvec0& d0, \
3012 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3013{ \
3014 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3015 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3016 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3017 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
3018 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
3019}
3020
3021OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8)
3022OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16)
3023OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32)
3024OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32)
3025OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64)
3026OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64)
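// Illustrative usage sketch (not part of the header): swapping the B and R
// channels of a packed BGR buffer with the overloads above. The names
// swap_b_r, src, dst and npixels are assumptions for the example; npixels is
// assumed to be a multiple of the 64-pixel vector width.
//
//   void swap_b_r(const uchar* src, uchar* dst, int npixels)
//   {
//       v_uint8x64 b, g, r;
//       for (int i = 0; i < npixels; i += 64)
//       {
//           v_load_deinterleave(src + 3 * i, b, g, r);  // 64 BGR pixels -> b, g, r
//           v_store_interleave(dst + 3 * i, r, g, b);   // write them back as RGB
//       }
//   }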
3027
3028
3029
3030
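// Mask extraction and predicates: v_signmask packs the per-lane sign bits into
// an integer via the AVX-512 mask registers (the 8-bit variant needs 64 bits,
// hence int64), v_check_all/v_check_any test whether all/any lanes are
// negative, and v_scan_forward returns the index of the first negative lane.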
3031inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
3032inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3033inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3034inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3035
3036inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); }
3037inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); }
3038inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
3039inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
3040inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
3041inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
3042
3044inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3045inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
3046inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3047inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3048inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3049inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3050inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3051inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3052
3053inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
3054inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
3055inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
3056inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
3057inline bool v_check_all(const v_uint8x64& a) { return v_check_all(v_reinterpret_as_s8(a)); }
3058inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); }
3059inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
3060inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
3061inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); }
3062inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); }
3063inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
3064inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
3065
3066inline int v_scan_forward(const v_int8x64& a)
3067{
3068 int64 mask = _mm512_movepi8_mask(a.val);
3069 int mask32 = (int)mask;
3070 return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
3071}
3072inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
3073inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
3074inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
3075inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3076inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3077inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3078inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3079inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3080inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3081
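// v512_cleanup zeroes the wide vector registers (vzeroall) so that legacy SSE
// code executed afterwards does not pay AVX/SSE transition penalties.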
3082inline void v512_cleanup() { _mm256_zeroall(); }
3083
3084CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3085
3087
3088} // cv::
3089
3090#endif // OPENCV_HAL_INTRIN_AVX512_HPP