EstervQrCode 1.1.1
Library for QR code manipulation
intrin_avx512.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4 
5 #ifndef OPENCV_HAL_INTRIN_AVX512_HPP
6 #define OPENCV_HAL_INTRIN_AVX512_HPP
7 
8 #if defined(_MSC_VER) && (_MSC_VER < 1920/*MSVS2019*/)
9 # pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
10 # pragma warning(disable:4309) // 'argument': truncation of constant value
11 # pragma warning(disable:4310) // cast truncates constant value
12 #endif
13 
14 #define CVT_ROUND_MODES_IMPLEMENTED 0
15 
16 #define CV_SIMD512 1
17 #define CV_SIMD512_64F 1
18 #define CV_SIMD512_FP16 0 // no native operations with FP16 type. Only load/store conversions from/to float32x16 are available (if CV_FP16 == 1)
19 
20 #define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0))
21 #define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
22  _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
23  ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0))
24 #define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
25  a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
26  _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
27  ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
28  ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
29  ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0))
30 #define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
31  a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
32  a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
33  a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
34  _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
35  ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
36  ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
37  ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
38  ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
39  ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
40  ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
41  ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0))
42 #define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
43  a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
44  a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
45  a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
46  _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
47  (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
48  (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
49  (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
50  (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
51  (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
52  (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
53  (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0))
54 
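The _v512_set_* macros above assemble 8- and 16-bit lane constants out of 32/64-bit words, presumably to sidestep compilers that lack _mm512_set_epi8/_mm512_set_epi16. A hypothetical standalone check of that byte packing, using only plain <immintrin.h> calls (AVX-512F) and not part of this header:

// Hypothetical check: the macro packs bytes a3..a0 into one 32-bit word as
// (a3<<24)|(a2<<16)|(a1<<8)|a0, so byte lane 0 of the register holds a0.
#include <immintrin.h>
#include <cstdio>

int main()
{
    unsigned word = (4u << 24) | (3u << 16) | (2u << 8) | 1u;  // a3=4, a2=3, a1=2, a0=1
    __m512i v = _mm512_set1_epi32((int)word);                  // broadcast to all 16 dwords

    unsigned char bytes[64];
    _mm512_storeu_si512(bytes, v);
    printf("%u %u %u %u\n", bytes[0], bytes[1], bytes[2], bytes[3]);  // 1 2 3 4
}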
55 #ifndef _mm512_cvtpd_pslo
56 #ifdef _mm512_zextsi256_si512
57 #define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
58 #else
59 // fallback: the preferred zero-extending cast is unavailable on this compiler
60 #define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
61 #endif
62 #endif
64 
65 namespace
66 {
67 
68 inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi)
69 { return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); }
70 
71 inline __m512 _v512_combine(const __m256& lo, const __m256& hi)
72 { return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); }
73 
74 inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi)
75 { return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); }
76 
77 inline int _v_cvtsi512_si32(const __m512i& a)
78 { return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); }
79 
80 inline __m256i _v512_extract_high(const __m512i& v)
81 { return _mm512_extracti32x8_epi32(v, 1); }
82 
83 inline __m256 _v512_extract_high(const __m512& v)
84 { return _mm512_extractf32x8_ps(v, 1); }
85 
86 inline __m256d _v512_extract_high(const __m512d& v)
87 { return _mm512_extractf64x4_pd(v, 1); }
88 
89 inline __m256i _v512_extract_low(const __m512i& v)
90 { return _mm512_castsi512_si256(v); }
91 
92 inline __m256 _v512_extract_low(const __m512& v)
93 { return _mm512_castps512_ps256(v); }
94 
95 inline __m256d _v512_extract_low(const __m512d& v)
96 { return _mm512_castpd512_pd256(v); }
97 
98 inline __m512i _v512_insert(const __m512i& a, const __m256i& b)
99 { return _mm512_inserti32x8(a, b, 0); }
100 
101 inline __m512 _v512_insert(const __m512& a, const __m256& b)
102 { return _mm512_insertf32x8(a, b, 0); }
103 
104 inline __m512d _v512_insert(const __m512d& a, const __m256d& b)
105 { return _mm512_insertf64x4(a, b, 0); }
106 
107 }
108 
109 namespace cv
110 {
111 
113 
114 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
115 
117 
118 struct v_uint8x64
119 {
120  typedef uchar lane_type;
121  enum { nlanes = 64 };
122  __m512i val;
123 
124  explicit v_uint8x64(__m512i v) : val(v) {}
125  v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3,
126  uchar v4, uchar v5, uchar v6, uchar v7,
127  uchar v8, uchar v9, uchar v10, uchar v11,
128  uchar v12, uchar v13, uchar v14, uchar v15,
129  uchar v16, uchar v17, uchar v18, uchar v19,
130  uchar v20, uchar v21, uchar v22, uchar v23,
131  uchar v24, uchar v25, uchar v26, uchar v27,
132  uchar v28, uchar v29, uchar v30, uchar v31,
133  uchar v32, uchar v33, uchar v34, uchar v35,
134  uchar v36, uchar v37, uchar v38, uchar v39,
135  uchar v40, uchar v41, uchar v42, uchar v43,
136  uchar v44, uchar v45, uchar v46, uchar v47,
137  uchar v48, uchar v49, uchar v50, uchar v51,
138  uchar v52, uchar v53, uchar v54, uchar v55,
139  uchar v56, uchar v57, uchar v58, uchar v59,
140  uchar v60, uchar v61, uchar v62, uchar v63)
141  {
142  val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
143  v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
144  v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
145  v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
146  }
147  v_uint8x64() {}
148 
149  static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }
150 
151  uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
152 };
153 
154 struct v_int8x64
155 {
156  typedef schar lane_type;
157  enum { nlanes = 64 };
158  __m512i val;
159 
160  explicit v_int8x64(__m512i v) : val(v) {}
161  v_int8x64(schar v0, schar v1, schar v2, schar v3,
162  schar v4, schar v5, schar v6, schar v7,
163  schar v8, schar v9, schar v10, schar v11,
164  schar v12, schar v13, schar v14, schar v15,
165  schar v16, schar v17, schar v18, schar v19,
166  schar v20, schar v21, schar v22, schar v23,
167  schar v24, schar v25, schar v26, schar v27,
168  schar v28, schar v29, schar v30, schar v31,
169  schar v32, schar v33, schar v34, schar v35,
170  schar v36, schar v37, schar v38, schar v39,
171  schar v40, schar v41, schar v42, schar v43,
172  schar v44, schar v45, schar v46, schar v47,
173  schar v48, schar v49, schar v50, schar v51,
174  schar v52, schar v53, schar v54, schar v55,
175  schar v56, schar v57, schar v58, schar v59,
176  schar v60, schar v61, schar v62, schar v63)
177  {
178  val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
179  v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
180  v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
181  v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
182  }
183  v_int8x64() {}
184 
185  static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }
186 
187  schar get0() const { return (schar)_v_cvtsi512_si32(val); }
188 };
189 
190 struct v_uint16x32
191 {
192  typedef ushort lane_type;
193  enum { nlanes = 32 };
194  __m512i val;
195 
196  explicit v_uint16x32(__m512i v) : val(v) {}
197  v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3,
198  ushort v4, ushort v5, ushort v6, ushort v7,
199  ushort v8, ushort v9, ushort v10, ushort v11,
200  ushort v12, ushort v13, ushort v14, ushort v15,
201  ushort v16, ushort v17, ushort v18, ushort v19,
202  ushort v20, ushort v21, ushort v22, ushort v23,
203  ushort v24, ushort v25, ushort v26, ushort v27,
204  ushort v28, ushort v29, ushort v30, ushort v31)
205  {
206  val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
207  v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
208  }
209  v_uint16x32() {}
210 
211  static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }
212 
213  ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
214 };
215 
216 struct v_int16x32
217 {
218  typedef short lane_type;
219  enum { nlanes = 32 };
220  __m512i val;
221 
222  explicit v_int16x32(__m512i v) : val(v) {}
223  v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
224  short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15,
225  short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
226  short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)
227  {
228  val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24,
229  (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16,
230  (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
231  (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0);
232  }
233  v_int16x32() {}
234 
235  static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }
236 
237  short get0() const { return (short)_v_cvtsi512_si32(val); }
238 };
239 
240 struct v_uint32x16
241 {
242  typedef unsigned lane_type;
243  enum { nlanes = 16 };
244  __m512i val;
245 
246  explicit v_uint32x16(__m512i v) : val(v) {}
247  v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
248  unsigned v4, unsigned v5, unsigned v6, unsigned v7,
249  unsigned v8, unsigned v9, unsigned v10, unsigned v11,
250  unsigned v12, unsigned v13, unsigned v14, unsigned v15)
251  {
252  val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7,
253  (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15);
254  }
255  v_uint32x16() {}
256 
257  static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }
258 
259  unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
260 };
261 
262 struct v_int32x16
263 {
264  typedef int lane_type;
265  enum { nlanes = 16 };
266  __m512i val;
267 
268  explicit v_int32x16(__m512i v) : val(v) {}
269  v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
270  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
271  {
272  val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
273  }
274  v_int32x16() {}
275 
276  static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }
277 
278  int get0() const { return _v_cvtsi512_si32(val); }
279 };
280 
281 struct v_float32x16
282 {
283  typedef float lane_type;
284  enum { nlanes = 16 };
285  __m512 val;
286 
287  explicit v_float32x16(__m512 v) : val(v) {}
288  v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7,
289  float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
290  {
291  val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
292  }
293  v_float32x16() {}
294 
295  static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }
296 
297  float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
298 };
299 
300 struct v_uint64x8
301 {
302  typedef uint64 lane_type;
303  enum { nlanes = 8 };
304  __m512i val;
305 
306  explicit v_uint64x8(__m512i v) : val(v) {}
307  v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)
308  { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); }
309  v_uint64x8() {}
310 
311  static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }
312 
313  uint64 get0() const
314  {
315  #if defined __x86_64__ || defined _M_X64
316  return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
317  #else
318  int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
319  int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
320  return (unsigned)a | ((uint64)(unsigned)b << 32);
321  #endif
322  }
323 };
324 
325 struct v_int64x8
326 {
327  typedef int64 lane_type;
328  enum { nlanes = 8 };
329  __m512i val;
330 
331  explicit v_int64x8(__m512i v) : val(v) {}
332  v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
333  { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); }
334  v_int64x8() {}
335 
336  static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }
337 
338  int64 get0() const
339  {
340  #if defined __x86_64__ || defined _M_X64
341  return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
342  #else
343  int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
344  int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
345  return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
346  #endif
347  }
348 };
349 
350 struct v_float64x8
351 {
352  typedef double lane_type;
353  enum { nlanes = 8 };
354  __m512d val;
355 
356  explicit v_float64x8(__m512d v) : val(v) {}
357  v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
358  { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); }
359  v_float64x8() {}
360 
361  static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }
362 
363  double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
364 };
365 
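Each register wrapper above follows the same pattern: a lane_type typedef, an nlanes enum, the raw __m512/__m512i/__m512d payload in val, a zero() factory and get0() for lane 0. The get0() trick is just a downcast plus a scalar move; a minimal sketch with raw intrinsics (assumes AVX-512F, not part of the header):

#include <immintrin.h>
#include <cstdio>

int main()
{
    // get0() reads lane 0: cast the 512-bit register to its low 128 bits,
    // then move the low 32 bits into a scalar register.
    __m512i v = _mm512_setr_epi32(7, 1, 2, 3, 4, 5, 6, 0, 8, 9, 10, 11, 12, 13, 14, 15);
    int lane0 = _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
    printf("%d\n", lane0);  // 7
}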
367 
368 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \
369  inline _Tpvec v512_load(const _Tp* ptr) \
370  { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \
371  inline _Tpvec v512_load_aligned(const _Tp* ptr) \
372  { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \
373  inline _Tpvec v512_load_low(const _Tp* ptr) \
374  { \
375  __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \
376  return _Tpvec(_mm512_castsi256_si512(v256)); \
377  } \
378  inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
379  { \
380  __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \
381  __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \
382  return _Tpvec(_v512_combine(vlo, vhi)); \
383  } \
384  inline void v_store(_Tp* ptr, const _Tpvec& a) \
385  { _mm512_storeu_si512((__m512i*)ptr, a.val); } \
386  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
387  { _mm512_store_si512((__m512i*)ptr, a.val); } \
388  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
389  { _mm512_stream_si512((__m512i*)ptr, a.val); } \
390  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
391  { \
392  if( mode == hal::STORE_UNALIGNED ) \
393  _mm512_storeu_si512((__m512i*)ptr, a.val); \
394  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
395  _mm512_stream_si512((__m512i*)ptr, a.val); \
396  else \
397  _mm512_store_si512((__m512i*)ptr, a.val); \
398  } \
399  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
400  { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \
401  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
402  { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }
403 
404 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64, uchar)
405 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64, schar)
406 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
407 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32, short)
408 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned)
409 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16, int)
410 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8, uint64)
411 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8, int64)
412 
413 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
414  inline _Tpvec v512_load(const _Tp* ptr) \
415  { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \
416  inline _Tpvec v512_load_aligned(const _Tp* ptr) \
417  { return _Tpvec(_mm512_load_##suffix(ptr)); } \
418  inline _Tpvec v512_load_low(const _Tp* ptr) \
419  { \
420  return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \
421  (_mm256_loadu_##suffix(ptr))); \
422  } \
423  inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
424  { \
425  halfreg vlo = _mm256_loadu_##suffix(ptr0); \
426  halfreg vhi = _mm256_loadu_##suffix(ptr1); \
427  return _Tpvec(_v512_combine(vlo, vhi)); \
428  } \
429  inline void v_store(_Tp* ptr, const _Tpvec& a) \
430  { _mm512_storeu_##suffix(ptr, a.val); } \
431  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
432  { _mm512_store_##suffix(ptr, a.val); } \
433  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
434  { _mm512_stream_##suffix(ptr, a.val); } \
435  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
436  { \
437  if( mode == hal::STORE_UNALIGNED ) \
438  _mm512_storeu_##suffix(ptr, a.val); \
439  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
440  _mm512_stream_##suffix(ptr, a.val); \
441  else \
442  _mm512_store_##suffix(ptr, a.val); \
443  } \
444  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
445  { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \
446  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
447  { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }
448 
449 OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float, ps, __m256)
450 OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)
451 
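The load/store macros dispatch between the three AVX-512 store flavours via hal::StoreMode. A minimal sketch of what they reduce to for float data (the aligned and streaming variants require a 64-byte aligned destination):

#include <immintrin.h>
#include <cstdio>

int main()
{
    alignas(64) float dst[16];
    __m512 v = _mm512_set1_ps(1.5f);

    _mm512_storeu_ps(dst, v);   // STORE_UNALIGNED: no alignment requirement
    _mm512_store_ps(dst, v);    // default: 64-byte aligned store
    _mm512_stream_ps(dst, v);   // STORE_ALIGNED_NOCACHE: non-temporal, bypasses the cache
    _mm_sfence();               // order the streaming store before later loads

    printf("%f\n", dst[0]);     // 1.5
}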
452 #define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \
453  inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
454  { return _Tpvec(cast(a.val)); }
455 
456 #define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
457  inline _Tpvec v512_setzero_##suffix() \
458  { return _Tpvec(_mm512_setzero_si512()); } \
459  inline _Tpvec v512_setall_##suffix(_Tp v) \
460  { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
461  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \
462  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \
463  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \
464  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, OPENCV_HAL_NOP) \
465  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, OPENCV_HAL_NOP) \
466  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, OPENCV_HAL_NOP) \
467  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, OPENCV_HAL_NOP) \
468  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, OPENCV_HAL_NOP) \
469  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
470  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8, suffix, _mm512_castpd_si512)
471 
472 OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64, uchar, u8, epi8, char)
473 OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64, schar, s8, epi8, char)
474 OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort, u16, epi16, short)
475 OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32, short, s16, epi16, short)
476 OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int)
477 OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16, int, s32, epi32, int)
478 OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8, uint64, u64, epi64, int64)
479 OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64)
480 
481 #define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
482  inline _Tpvec v512_setzero_##suffix() \
483  { return _Tpvec(_mm512_setzero_##zsuffix()); } \
484  inline _Tpvec v512_setall_##suffix(_Tp v) \
485  { return _Tpvec(_mm512_set1_##zsuffix(v)); } \
486  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \
487  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \
488  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
489  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, cast) \
490  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \
491  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, cast) \
492  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, cast) \
493  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, cast)
494 
495 OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float, f32, ps, _mm512_castsi512_ps)
496 OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8, double, f64, pd, _mm512_castsi512_pd)
497 
498 inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a)
499 { return a; }
500 inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a)
501 { return v_float32x16(_mm512_castpd_ps(a.val)); }
502 
503 inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a)
504 { return a; }
505 inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
506 { return v_float64x8(_mm512_castps_pd(a.val)); }
507 
508 // FP16
509 inline v_float32x16 v512_load_expand(const hfloat* ptr)
510 {
511  return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
512 }
513 
514 inline void v_pack_store(hfloat* ptr, const v_float32x16& a)
515 {
516  __m256i ah = _mm512_cvtps_ph(a.val, 0);
517  _mm256_storeu_si256((__m256i*)ptr, ah);
518 }
519 
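v512_load_expand and v_pack_store wrap the 512-bit half-float conversions: sixteen FP16 values in a __m256i widen to a __m512 and back. A round-trip sketch with the same intrinsics and the same rounding argument (0) used above:

#include <immintrin.h>
#include <cstdio>

int main()
{
    float src[16], dst[16];
    for (int i = 0; i < 16; i++) src[i] = 0.5f * i;             // exactly representable in FP16

    __m256i half = _mm512_cvtps_ph(_mm512_loadu_ps(src), 0);    // float32x16 -> float16x16
    _mm512_storeu_ps(dst, _mm512_cvtph_ps(half));               // float16x16 -> float32x16

    printf("%g %g\n", dst[1], dst[15]);  // 0.5 7.5
}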
520 /* Recombine & ZIP */
521 inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
522 {
523 #if CV_AVX_512VBMI
524  __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24,
525  87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16,
526  79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8,
527  71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0);
528  ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val));
529  __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
530  119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
531  111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
532  103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32);
533  ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val));
534 #else
535  __m512i low = _mm512_unpacklo_epi8(a.val, b.val);
536  __m512i high = _mm512_unpackhi_epi8(a.val, b.val);
537  ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0), high));
538  ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high));
539 #endif
540 }
541 inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
542 {
543  __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8,
544  39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0);
545  ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val));
546  __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
547  55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
548  ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val));
549 }
550 inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
551 {
552  __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
553  ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val));
554  __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
555  ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val));
556 }
557 inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
558 {
559  __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0);
560  ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val));
561  __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
562  ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val));
563 }
564 
565 inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1)
566 {
567  v_int8x64 i0, i1;
568  v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1);
569  ab0 = v_reinterpret_as_u8(i0);
570  ab1 = v_reinterpret_as_u8(i1);
571 }
572 inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1)
573 {
574  v_int16x32 i0, i1;
575  v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1);
576  ab0 = v_reinterpret_as_u16(i0);
577  ab1 = v_reinterpret_as_u16(i1);
578 }
579 inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1)
580 {
581  v_int32x16 i0, i1;
582  v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
583  ab0 = v_reinterpret_as_u32(i0);
584  ab1 = v_reinterpret_as_u32(i1);
585 }
586 inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1)
587 {
588  v_int64x8 i0, i1;
589  v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
590  ab0 = v_reinterpret_as_u64(i0);
591  ab1 = v_reinterpret_as_u64(i1);
592 }
593 inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1)
594 {
595  v_int32x16 i0, i1;
596  v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
597  ab0 = v_reinterpret_as_f32(i0);
598  ab1 = v_reinterpret_as_f32(i1);
599 }
600 inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1)
601 {
602  v_int64x8 i0, i1;
603  v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
604  ab0 = v_reinterpret_as_f64(i0);
605  ab1 = v_reinterpret_as_f64(i1);
606 }
607 
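On targets without AVX-512VBMI, the byte v_zip above uses unpacklo/unpackhi (which interleave only within each 128-bit block) followed by a 64-bit cross-lane permute to restore whole-register order. A standalone sketch of that path (assumes AVX-512F + BW):

#include <immintrin.h>
#include <cstdio>

int main()
{
    unsigned char av[64], bv[64], out[64];
    for (int i = 0; i < 64; i++) { av[i] = (unsigned char)i; bv[i] = (unsigned char)(100 + i); }

    __m512i a = _mm512_loadu_si512(av), b = _mm512_loadu_si512(bv);
    __m512i lo = _mm512_unpacklo_epi8(a, b);   // interleaved, but per 128-bit block
    __m512i hi = _mm512_unpackhi_epi8(a, b);
    // same permute constant as _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0) above
    __m512i ab0 = _mm512_permutex2var_epi64(lo, _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0), hi);

    _mm512_storeu_si512(out, ab0);
    printf("%u %u %u %u ... %u %u\n", out[0], out[1], out[2], out[3], out[62], out[63]);
    // 0 100 1 101 ... 31 131  (first halves of a and b interleaved lane by lane)
}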
608 #define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \
609  inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
610  { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
611  inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
612  { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \
613  inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
614  _Tpvec& c, _Tpvec& d) \
615  { \
616  c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val)); \
617  d.val = _v512_insert(b.val,_v512_extract_high(a.val)); \
618  }
619 
620 
621 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64, epi8)
622 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64, epi8)
623 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32, epi16)
624 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32, epi16)
625 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16, epi32)
626 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16, epi32)
627 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8, epi64)
628 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8, epi64)
629 OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
630 OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8, pd)
631 
632 
634 /* Element-wise binary and unary operations */
635 
636 
637 #define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \
638  inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
639  { return _Tpvec(intrin(a.val, b.val)); }
640 
641 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8)
642 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8)
643 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
644 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16)
645 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8)
646 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8)
647 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
648 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16)
649 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16)
650 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16)
651 
652 inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
653 {
654  __m512i ad = _mm512_srai_epi16(a.val, 8);
655  __m512i bd = _mm512_srai_epi16(b.val, 8);
656  __m512i p0 = _mm512_mullo_epi16(a.val, b.val); // even
657  __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8); // odd
658  return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1));
659 }
660 inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
661 {
662  return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
663 }
664 
665 #define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
666  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
667  { return _Tpvec(intrin(a.val, b.val)); } \
668  inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
669  { a.val = intrin(a.val, b.val); return a; }
670 
671 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
672 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
673 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
674 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
675 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
676 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
677 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
678 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
679 
680 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
681 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
682 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
683 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
684 
685 
686 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8)
687 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8)
688 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8)
689 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8)
690 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
691 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
692 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16)
693 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16)
694 
695 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
696 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
697 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
698 OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
699 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
700 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
701 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
702 OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
703 
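Note that operator+ / operator- on 8- and 16-bit vectors are saturating (adds/subs), while v_add_wrap / v_sub_wrap above wrap modulo the lane width. The difference, as a small sketch (assumes AVX-512BW):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m512i a = _mm512_set1_epi8((char)250), b = _mm512_set1_epi8(10);

    unsigned char sat[64], wrap[64];
    _mm512_storeu_si512(sat,  _mm512_adds_epu8(a, b));  // saturating: clamps at 255
    _mm512_storeu_si512(wrap, _mm512_add_epi8(a, b));   // wrapping: (250 + 10) mod 256
    printf("%u %u\n", sat[0], wrap[0]);  // 255 4
}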
704 // saturating multiply
705 inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
706 {
707  v_uint16x32 c, d;
708  v_mul_expand(a, b, c, d);
709  return v_pack(c, d);
710 }
711 inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
712 {
713  v_int16x32 c, d;
714  v_mul_expand(a, b, c, d);
715  return v_pack(c, d);
716 }
717 inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
718 {
719  __m512i pl = _mm512_mullo_epi16(a.val, b.val);
720  __m512i ph = _mm512_mulhi_epu16(a.val, b.val);
721  __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
722  __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
723 
724  const __m512i m = _mm512_set1_epi32(65535);
725  return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
726 }
727 inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
728 {
729  __m512i pl = _mm512_mullo_epi16(a.val, b.val);
730  __m512i ph = _mm512_mulhi_epi16(a.val, b.val);
731  __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
732  __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
733  return v_int16x32(_mm512_packs_epi32(p0, p1));
734 }
735 
736 inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
737 { a = a * b; return a; }
738 inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
739 { a = a * b; return a; }
740 inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
741 { a = a * b; return a; }
742 inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
743 { a = a * b; return a; }
744 
745 inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
746 inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
747 
748 // Multiply and expand
749 inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
750  v_uint16x32& c, v_uint16x32& d)
751 {
752  v_uint16x32 a0, a1, b0, b1;
753  v_expand(a, a0, a1);
754  v_expand(b, b0, b1);
755  c = v_mul_wrap(a0, b0);
756  d = v_mul_wrap(a1, b1);
757 }
758 
759 inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
760  v_int16x32& c, v_int16x32& d)
761 {
762  v_int16x32 a0, a1, b0, b1;
763  v_expand(a, a0, a1);
764  v_expand(b, b0, b1);
765  c = v_mul_wrap(a0, b0);
766  d = v_mul_wrap(a1, b1);
767 }
768 
769 inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
770  v_int32x16& c, v_int32x16& d)
771 {
772  v_int16x32 v0, v1;
773  v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
774 
775  c = v_reinterpret_as_s32(v0);
776  d = v_reinterpret_as_s32(v1);
777 }
778 
779 inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
780  v_uint32x16& c, v_uint32x16& d)
781 {
782  v_uint16x32 v0, v1;
783  v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
784 
785  c = v_reinterpret_as_u32(v0);
786  d = v_reinterpret_as_u32(v1);
787 }
788 
789 inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b,
790  v_uint64x8& c, v_uint64x8& d)
791 {
792  v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)),
793  v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
794 }
795 
796 inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
797  v_int64x8& c, v_int64x8& d)
798 {
799  v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)),
800  v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
801 }
802 
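For 16-bit lanes, v_mul_expand builds the full 32-bit products by computing the low and high halves separately (mullo/mulhi) and zipping them back together. A sketch of one lane (assumes AVX-512BW, not part of the header):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m512i a = _mm512_set1_epi16(300), b = _mm512_set1_epi16(400);

    __m512i lo = _mm512_mullo_epi16(a, b);          // low 16 bits of each product
    __m512i hi = _mm512_mulhi_epi16(a, b);          // high 16 bits of each product
    __m512i prod = _mm512_unpacklo_epi16(lo, hi);   // interleave -> full 32-bit products

    int out[16];
    _mm512_storeu_si512(out, prod);
    printf("%d\n", out[0]);  // 120000 == 300 * 400
}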
804 #define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
805  inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
806  { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
807  inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
808  { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
809  inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
810  { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
811  inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
812  { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
813  template<int imm> \
814  inline _Tpuvec v_shl(const _Tpuvec& a) \
815  { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
816  template<int imm> \
817  inline _Tpsvec v_shl(const _Tpsvec& a) \
818  { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
819  template<int imm> \
820  inline _Tpuvec v_shr(const _Tpuvec& a) \
821  { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
822  template<int imm> \
823  inline _Tpsvec v_shr(const _Tpsvec& a) \
824  { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }
825 
826 OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
827 OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
828 OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64)
829 
830 
831 
832 #define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
833  OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
834  OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
835  OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
836  inline _Tpvec operator ~ (const _Tpvec& a) \
837  { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
838 
839 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1))
840 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64, si512, _mm512_set1_epi32(-1))
841 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32, si512, _mm512_set1_epi32(-1))
842 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32, si512, _mm512_set1_epi32(-1))
843 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16, si512, _mm512_set1_epi32(-1))
844 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16, si512, _mm512_set1_epi32(-1))
845 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8, si512, _mm512_set1_epi64(-1))
846 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8, si512, _mm512_set1_epi64(-1))
847 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps, _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
848 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8, pd, _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
849 
851 #define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \
852  inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
853  { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }
854 
855 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64, epi8, si512)
856 OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64, epi8, si512)
857 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512)
858 OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32, epi16, si512)
859 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512)
860 OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16, epi32, si512)
861 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8, epi64, si512)
862 OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8, epi64, si512)
863 OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps, ps)
864 OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd)
865 
866 
867 #define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
868  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
869  { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
870 
871 #define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
872  OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
873  OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
874  OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
875  OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
876  OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
877  OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
878 
879 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1)
880 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1)
881 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
882 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32, epi16, epi16, (short)-1)
883 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
884 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16, epi32, epi32, (int)-1)
885 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1)
886 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1)
887 
888 #define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
889  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
890  { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
891 
892 #define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
893  OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
894  OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
895  OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
896  OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
897  OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
898  OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
899 
900 OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
901 OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1)
902 
903 inline v_float32x16 v_not_nan(const v_float32x16& a)
904 { return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); }
905 inline v_float64x8 v_not_nan(const v_float64x8& a)
906 { return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); }
907 
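AVX-512 comparisons produce a bit mask (__mmask8/16/32/64) rather than a vector, so the comparison operators above re-materialize an all-ones/all-zeros vector with _mm512_maskz_set1_*, which is the mask representation the rest of the universal intrinsics (e.g. v_select above) expect. A minimal sketch (AVX-512F):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m512 a = _mm512_set1_ps(1.0f), b = _mm512_set1_ps(2.0f);

    __mmask16 m = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);   // 0xffff: every lane 1.0 < 2.0
    __m512i vecmask = _mm512_maskz_set1_epi32(m, -1);     // back to an all-ones lane mask

    int out[16];
    _mm512_storeu_si512(out, vecmask);
    printf("0x%04x %d\n", (unsigned)m, out[0]);  // 0xffff -1
}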
909 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64, _mm512_min_epu8)
910 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64, _mm512_max_epu8)
911 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64, _mm512_min_epi8)
912 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64, _mm512_max_epi8)
913 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32, _mm512_min_epu16)
914 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32, _mm512_max_epu16)
915 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32, _mm512_min_epi16)
916 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32, _mm512_max_epi16)
917 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16, _mm512_min_epu32)
918 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16, _mm512_max_epu32)
919 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16, _mm512_min_epi32)
920 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16, _mm512_max_epi32)
921 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8, _mm512_min_epu64)
922 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8, _mm512_max_epu64)
923 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8, _mm512_min_epi64)
924 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8, _mm512_max_epi64)
925 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
926 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
927 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8, _mm512_min_pd)
928 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8, _mm512_max_pd)
929 
930 
931 namespace {
932  template<bool prec, int imm4, bool part, int imm32>
933  struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
934  template<int imm4, int imm32>
935  struct _v_rotate_right<true, imm4, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
936  {
937  return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 ), imm4 *8),
938  _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8)));
939  }};
940  template<int imm4>
941  struct _v_rotate_right<true, imm4, false, 15> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
942  {
943  return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15), imm4 *8),
944  _mm512_slli_epi32( b.val, (4-imm4)*8)));
945  }};
946  template<int imm4, int imm32>
947  struct _v_rotate_right<true, imm4, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
948  {
949  return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16), imm4 *8),
950  _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
951  }};
952  template<int imm4>
953  struct _v_rotate_right<true, imm4, true, 31> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
954  { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }};
955  template<int imm32>
956  struct _v_rotate_right<false, 0, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
957  { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }};
958  template<>
959  struct _v_rotate_right<false, 0, false, 0> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }};
960  template<int imm32>
961  struct _v_rotate_right<false, 0, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
962  { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }};
963  template<>
964  struct _v_rotate_right<false, 0, true, 16> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }};
965  template<>
966  struct _v_rotate_right<false, 0, true, 32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
967 }
968 template<int imm> inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
969 {
970  return imm >= 128 ? v_int8x64() :
971 #if CV_AVX_512VBMI
972  v_int8x64(_mm512_permutex2var_epi8(a.val,
973  _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
974  0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
975  0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
976  0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
977  0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
978  0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
979  0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
980  0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
981 #else
982  _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b);
983 #endif
984 }
985 template<int imm>
986 inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
987 {
988  if (imm == 0) return a;
989  if (imm == 64) return b;
990  if (imm >= 128) return v_int8x64();
991 #if CV_AVX_512VBMI
992  return v_int8x64(_mm512_permutex2var_epi8(b.val,
993  _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
994  0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
995  0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
996  0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
997  0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
998  0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
999  0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
1000  0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
1001 #else
1002  return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b);
1003 #endif
1004 }
1005 template<int imm>
1006 inline v_int8x64 v_rotate_right(const v_int8x64& a)
1007 {
1008  if (imm == 0) return a;
1009  if (imm >= 64) return v_int8x64();
1010 #if CV_AVX_512VBMI
1011  return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm,
1012  _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
1013  0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
1014  0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
1015  0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
1016  0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
1017  0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
1018  0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
1019  0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
1020 #else
1021  return v_rotate_right<imm>(a, v512_setzero_s8());
1022 #endif
1023 }
1024 template<int imm>
1025 inline v_int8x64 v_rotate_left(const v_int8x64& a)
1026 {
1027  if (imm == 0) return a;
1028  if (imm >= 64) return v_int8x64();
1029 #if CV_AVX_512VBMI
1030  return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
1031  _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
1032  0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
1033  0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
1034  0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
1035  0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
1036  0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
1037  0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
1038  0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
1039 #else
1040  return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
1041 #endif
1042 }
1043 
1044 #define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \
1045 template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1046 { return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1047 template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1048 { return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1049 template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1050 { return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); } \
1051 template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1052 { return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }
1053 
1054 #define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \
1055 template<int imm> \
1056 inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1057 { \
1058  enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
1059  enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
1060  if (imm == 0) return a; \
1061  if (imm == _Tpvec::nlanes) return b; \
1062  if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
1063  return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
1064 } \
1065 template<int imm> \
1066 inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1067 { \
1068  enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
1069  enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
1070  if (imm == 0) return a; \
1071  if (imm == _Tpvec::nlanes) return b; \
1072  if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
1073  return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
1074 } \
1075 template<int imm> \
1076 inline _Tpvec v_rotate_left(const _Tpvec& a) \
1077 { \
1078  if (imm == 0) return a; \
1079  if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
1080  return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
1081 } \
1082 template<int imm> \
1083 inline _Tpvec v_rotate_right(const _Tpvec& a) \
1084 { \
1085  if (imm == 0) return a; \
1086  if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
1087  return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
1088 }
1089 
1090 OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64, u8)
1091 OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32, u16)
1092 OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32, s16)
1093 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16, epi32)
1094 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16, epi32)
1095 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8, epi64)
1096 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64)
1097 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
1098 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd)
1099 
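When the byte offset is a whole number of 32-bit lanes and VBMI is unavailable, the 8-bit rotate path above falls back to _mm512_alignr_epi32 (the 32/64-bit variants instead use maskz_compress / mask_expand). A sketch of the alignr semantics, shifting the concatenation b:a right by whole dwords (AVX-512F):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m512i a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m512i b = _mm512_set1_epi32(99);

    // concatenation b:a shifted right by 2 dwords -> lanes 2..15 of a, then lanes of b
    __m512i r = _mm512_alignr_epi32(b, a, 2);

    int out[16];
    _mm512_storeu_si512(out, r);
    printf("%d %d %d\n", out[0], out[13], out[14]);  // 2 15 99
}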
1100 
1101 inline v_uint8x64 v_reverse(const v_uint8x64 &a)
1102 {
1103 #if CV_AVX_512VBMI
1104  static const __m512i perm = _mm512_set_epi32(
1105  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1106  0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
1107  0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
1108  0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
1109  return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
1110 #else
1111  static const __m512i shuf = _mm512_set_epi32(
1112  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1113  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1114  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1115  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
1116  static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
1117  __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
1118  return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
1119 #endif
1120 }
1121 
1122 inline v_int8x64 v_reverse(const v_int8x64 &a)
1123 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1124 
1125 inline v_uint16x32 v_reverse(const v_uint16x32 &a)
1126 {
1127 #if CV_AVX_512VBMI
1128  static const __m512i perm = _mm512_set_epi32(
1129  0x00000001, 0x00020003, 0x00040005, 0x00060007,
1130  0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
1131  0x00100011, 0x00120013, 0x00140015, 0x00160017,
1132  0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
1133  return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
1134 #else
1135  static const __m512i shuf = _mm512_set_epi32(
1136  0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1137  0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1138  0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1139  0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
1140  static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
1141  __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
1142  return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
1143 #endif
1144 }
1145 
1146 inline v_int16x32 v_reverse(const v_int16x32 &a)
1147 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1148 
1149 inline v_uint32x16 v_reverse(const v_uint32x16 &a)
1150 {
1151  static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1152  return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
1153 }
1154 
1155 inline v_int32x16 v_reverse(const v_int32x16 &a)
1156 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1157 
1158 inline v_float32x16 v_reverse(const v_float32x16 &a)
1159 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1160 
1161 inline v_uint64x8 v_reverse(const v_uint64x8 &a)
1162 {
1163  static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
1164  return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
1165 }
1166 
1167 inline v_int64x8 v_reverse(const v_int64x8 &a)
1168 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1169 
1170 inline v_float64x8 v_reverse(const v_float64x8 &a)
1171 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
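// Illustrative usage sketch (hypothetical buffer buf): v_reverse mirrors the lane order, so
// r[i] == v[nlanes - 1 - i] for every lane type defined above.
//   v_uint32x16 v = v512_load(buf);
//   v_uint32x16 r = v_reverse(v);   // r[0] == v[15], ..., r[15] == v[0]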
1172 
1174 
1176 #define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b
1177 #define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \
1178  inline sctype v_reduce_##func(const _Tpvec& a) \
1179  { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1180  sctype CV_DECL_ALIGNED(64) idx[2]; \
1181  _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \
1182  return scop(idx[0], idx[1]); }
1183 OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
1184 OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
1185 OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1186 OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, min, v_int64x8, min_epi64, min)
1187 OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, max, v_int64x8, max_epi64, max)
1188 OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, sum, v_int64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1189 
1190 #define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \
1191  inline double v_reduce_##func(const v_float64x8& a) \
1192  { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1193  double CV_DECL_ALIGNED(64) idx[2]; \
1194  _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \
1195  return scop(idx[0], idx[1]); }
1196 OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
1197 OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
1198 OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1199 
1200 #define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \
1201  inline sctype v_reduce_##func(const _Tpvec& a) \
1202  { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1203  __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1204  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1205  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1206  return (sctype)_mm_cvtsi128_si32(quarter); }
1207 OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
1208 OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
1209 OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, min, v_int32x16, min_epi32)
1210 OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, max, v_int32x16, max_epi32)
1211 
1212 #define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \
1213  inline float v_reduce_##func(const v_float32x16& a) \
1214  { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1215  __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \
1216  quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \
1217  quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \
1218  return _mm_cvtss_f32(quarter); }
1219 OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
1220 OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)
1221 
1222 inline float v_reduce_sum(const v_float32x16& a)
1223 {
1224  __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
1225  __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1));
1226  quarter = _mm_hadd_ps(quarter, quarter);
1227  return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter));
1228 }
1229 inline int v_reduce_sum(const v_int32x16& a)
1230 {
1231  __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
1232  __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1233  quarter = _mm_hadd_epi32(quarter, quarter);
1234  return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
1235 }
1236 inline uint v_reduce_sum(const v_uint32x16& a)
1237 { return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); }
1238 
1239 #define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \
1240  inline sctype v_reduce_##func(const _Tpvec& a) \
1241  { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1242  __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1243  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1244  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1245  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1246  return (sctype)_mm_cvtsi128_si32(quarter); }
1247 OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
1248 OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
1249 OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16)
1250 OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16)
1251 
1252 inline int v_reduce_sum(const v_int16x32& a)
1253 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1254 inline uint v_reduce_sum(const v_uint16x32& a)
1255 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1256 
1257 #define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
1258  inline sctype v_reduce_##func(const _Tpvec& a) \
1259  { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1260  __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1261  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1262  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1263  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1264  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \
1265  return (sctype)_mm_cvtsi128_si32(quarter); }
1266 OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
1267 OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
1268 OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8)
1269 OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8)
1270 
1271 #define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \
1272  inline sctype v_reduce_sum(const _Tpvec& a) \
1273  { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \
1274  _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \
1275  a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \
1276  __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \
1277  __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \
1278  a4 = _mm_hadd_epi32(a4, a4); \
1279  return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); }
1280 OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
1281 OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8)
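// Illustrative usage sketch for the reductions above (hypothetical float buffer buf):
//   v_float32x16 v = v512_load(buf);
//   float s  = v_reduce_sum(v);   // sum of all 16 lanes
//   float mn = v_reduce_min(v);   // smallest lane
//   float mx = v_reduce_max(v);   // largest lane
// The 8- and 16-bit sums widen internally, so e.g. v_reduce_sum of a v_uint8x64 returns a uint.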
1282 
1283 inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,
1284  const v_float32x16& c, const v_float32x16& d)
1285 {
1286  __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
1287  __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
1288  __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
1289  __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
1290  return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh)));
1291 }
1292 
1293 inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)
1294 {
1295  __m512i val = _mm512_sad_epu8(a.val, b.val);
1296  __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
1297  __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1298  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1299 }
1300 inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
1301 {
1302  __m512i val = _mm512_set1_epi8(-128);
1303  val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val));
1304  __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
1305  __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1306  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1307 }
1308 inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
1309 { return v_reduce_sum(v_add_wrap(a - b, b - a)); }
1310 inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
1311 { return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
1312 inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
1313 { return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
1314 inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
1315 { return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
1316 inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
1317 { return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
1318 inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
1319 { return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
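// Note: for signed 8-bit inputs the implementation above adds 128 to both operands first; this
// maps schar [-128,127] onto uchar [0,255] without changing |a - b|, so the unsigned
// _mm512_sad_epu8 instruction can be reused. Illustrative usage (hypothetical buffers src1/src2):
//   v_uint8x64 a = v512_load(src1), b = v512_load(src2);
//   unsigned sad = v_reduce_sad(a, b);   // sum of |a[i] - b[i]| over all 64 lanes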
1320 
1322 inline v_uint8x64 v_popcount(const v_int8x64& a)
1323 {
1324 #if CV_AVX_512BITALG
1325  return v_uint8x64(_mm512_popcnt_epi8(a.val));
1326 #elif CV_AVX_512VBMI
1327  __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
1328  5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
1329  5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
1330  4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
1331  __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
1332  6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
1333  6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
1334  5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
1335  return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));
1336 #else
1337  __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);
1338  __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);
1339 
1340  return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512( a.val, _popcnt_mask)),
1341  _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));
1342 #endif
1343 }
1344 inline v_uint16x32 v_popcount(const v_int16x32& a)
1345 {
1346 #if CV_AVX_512BITALG
1347  return v_uint16x32(_mm512_popcnt_epi16(a.val));
1348 #elif CV_AVX_512VPOPCNTDQ
1349  __m512i zero = _mm512_setzero_si512();
1350  return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)),
1351  _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
1352 #else
1353  v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
1354  p += v_rotate_right<1>(p);
1355  return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
1356 #endif
1357 }
1358 inline v_uint32x16 v_popcount(const v_int32x16& a)
1359 {
1360 #if CV_AVX_512VPOPCNTDQ
1361  return v_uint32x16(_mm512_popcnt_epi32(a.val));
1362 #else
1363  v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
1364  p += v_rotate_right<1>(p);
1365  p += v_rotate_right<2>(p);
1366  return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
1367 #endif
1368 }
1369 inline v_uint64x8 v_popcount(const v_int64x8& a)
1370 {
1371 #if CV_AVX_512VPOPCNTDQ
1372  return v_uint64x8(_mm512_popcnt_epi64(a.val));
1373 #else
1374  return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512()));
1375 #endif
1376 }
1377 
1378 
1379 inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8 (a)); }
1380 inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); }
1381 inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); }
1382 inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); }
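// Illustrative usage sketch (hypothetical buffer mask_bits): v_popcount returns, per lane, the
// number of set bits; the result is always the unsigned vector of the same lane width.
//   v_uint32x16 m = v512_load(mask_bits);
//   v_uint32x16 c = v_popcount(m);   // c[i] = number of 1-bits in m[i], in the range 0..32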
1383 
1384 
1386 
1388 #if CV_FMA3
1389 #define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
1390  inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1391  { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
1392  inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1393  { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); }
1394 #else
1395 #define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
1396  inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1397  { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); } \
1398  inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1399  { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); }
1400 #endif
1401 
1402 #define OPENCV_HAL_IMPL_AVX512_MISC(_Tpvec, suffix) \
1403  inline _Tpvec v_sqrt(const _Tpvec& x) \
1404  { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
1405  inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1406  { return v_fma(a, a, b * b); } \
1407  inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1408  { return v_sqrt(v_fma(a, a, b * b)); }
1409 
1410 OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
1411 OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
1412 OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)
1413 OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd)
1414 
1415 inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
1416 { return a * b + c; }
1417 inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
1418 { return v_fma(a, b, c); }
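// Illustrative usage sketch (hypothetical buffers x/y): v_fma and v_muladd both compute a*b + c
// (fused when CV_FMA3 is available), and v_magnitude/v_sqr_magnitude build on them:
//   v_float32x16 a = v512_load(x), b = v512_load(y);
//   v_float32x16 hyp  = v_magnitude(a, b);    // sqrt(a*a + b*b), per lane
//   v_float32x16 sqr  = v_fma(a, a, b * b);   // the same value before the square root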
1419 
1420 inline v_float32x16 v_invsqrt(const v_float32x16& x)
1421 {
1422 #if CV_AVX_512ER
1423  return v_float32x16(_mm512_rsqrt28_ps(x.val));
1424 #else
1425  v_float32x16 half = x * v512_setall_f32(0.5);
1426  v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
1427  t *= v512_setall_f32(1.5) - ((t * t) * half);
1428  return t;
1429 #endif
1430 }
1431 
1432 inline v_float64x8 v_invsqrt(const v_float64x8& x)
1433 {
1434 #if CV_AVX_512ER
1435  return v_float64x8(_mm512_rsqrt28_pd(x.val));
1436 #else
1437  return v512_setall_f64(1.) / v_sqrt(x);
1438 // v_float64x8 half = x * v512_setall_f64(0.5);
1439 // v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
1440 // t *= v512_setall_f64(1.5) - ((t * t) * half);
1441 // t *= v512_setall_f64(1.5) - ((t * t) * half);
1442 // return t;
1443 #endif
1444 }
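// Note on the non-AVX512ER float path above: _mm512_rsqrt14_ps provides roughly 14 bits of
// precision, and one Newton-Raphson step t' = t * (1.5 - 0.5 * x * t^2) roughly doubles that,
// which is sufficient for float32. The double-precision fallback simply computes 1 / sqrt(x).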
1445 
1447 #define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \
1448  inline _Tpuvec v_abs(const _Tpvec& x) \
1449  { return _Tpuvec(_mm512_abs_##suffix(x.val)); }
1450 
1451 OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8)
1452 OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16)
1453 OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32)
1454 OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64)
1455 
1456 inline v_float32x16 v_abs(const v_float32x16& x)
1457 {
1458 #ifdef _mm512_abs_pd
1459  return v_float32x16(_mm512_abs_ps(x.val));
1460 #else
1461  return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),
1462  _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
1463  0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
1464 #endif
1465 }
1466 
1467 inline v_float64x8 v_abs(const v_float64x8& x)
1468 {
1469 #ifdef _mm512_abs_pd
1470  #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
1471  // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476
1472  return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
1473  #else
1474  return v_float64x8(_mm512_abs_pd(x.val));
1475  #endif
1476 #else
1477  return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),
1478  _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
1479  0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
1480 #endif
1481 }
1482 
1484 inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
1485 { return v_add_wrap(a - b, b - a); }
1486 inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
1487 { return v_add_wrap(a - b, b - a); }
1488 inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
1489 { return v_max(a, b) - v_min(a, b); }
1490 
1491 inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
1492 {
1493  v_int8x64 d = v_sub_wrap(a, b);
1494  v_int8x64 m = a < b;
1495  return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1496 }
1497 
1498 inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
1499 { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1500 
1501 inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
1502 {
1503  v_int32x16 d = a - b;
1504  v_int32x16 m = a < b;
1505  return v_reinterpret_as_u32((d ^ m) - m);
1506 }
1507 
1508 inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
1509 { return v_abs(a - b); }
1510 
1511 inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
1512 { return v_abs(a - b); }
1513 
1515 inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
1516 {
1517  v_int8x64 d = a - b;
1518  v_int8x64 m = a < b;
1519  return (d ^ m) - m;
1520 }
1521 inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
1522 { return v_max(a, b) - v_min(a, b); }
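// Note on the signed v_absdiff/v_absdiffs above: with m = (a < b) being all-ones exactly where
// a < b, the expression (d ^ m) - m conditionally negates the difference d (two's complement:
// -d == ~d + 1), yielding |a - b| without branches. Illustrative usage (hypothetical buffers):
//   v_int16x32 a = v512_load(p), b = v512_load(q);
//   v_uint16x32 ad = v_absdiff(a, b);    // widens to unsigned, full-range result
//   v_int16x32  as = v_absdiffs(a, b);   // stays signed; saturates instead of wrapping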
1523 
1525 
1527 inline v_int32x16 v_round(const v_float32x16& a)
1528 { return v_int32x16(_mm512_cvtps_epi32(a.val)); }
1529 
1530 inline v_int32x16 v_round(const v_float64x8& a)
1531 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); }
1532 
1533 inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)
1534 { return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), _mm512_cvtpd_epi32(b.val))); }
1535 
1536 inline v_int32x16 v_trunc(const v_float32x16& a)
1537 { return v_int32x16(_mm512_cvttps_epi32(a.val)); }
1538 
1539 inline v_int32x16 v_trunc(const v_float64x8& a)
1540 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); }
1541 
1542 #if CVT_ROUND_MODES_IMPLEMENTED
1543 inline v_int32x16 v_floor(const v_float32x16& a)
1544 { return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); }
1545 
1546 inline v_int32x16 v_floor(const v_float64x8& a)
1547 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); }
1548 
1549 inline v_int32x16 v_ceil(const v_float32x16& a)
1550 { return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }
1551 
1552 inline v_int32x16 v_ceil(const v_float64x8& a)
1553 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); }
1554 #else
1555 inline v_int32x16 v_floor(const v_float32x16& a)
1556 { return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); }
1557 
1558 inline v_int32x16 v_floor(const v_float64x8& a)
1559 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); }
1560 
1561 inline v_int32x16 v_ceil(const v_float32x16& a)
1562 { return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); }
1563 
1564 inline v_int32x16 v_ceil(const v_float64x8& a)
1565 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); }
1566 #endif
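// Note on the fallback path above: the _mm512_roundscale_* immediate selects the rounding mode
// (1 = toward -inf, 2 = toward +inf), so v_floor/v_ceil round in floating point first and the
// integer conversion then sees an already-integral value. Illustrative usage (hypothetical buf):
//   v_float32x16 v = v512_load(buf);
//   v_int32x16 r = v_round(v);   // nearest (default rounding mode)
//   v_int32x16 f = v_floor(v);   // toward -inf
//   v_int32x16 c = v_ceil(v);    // toward +inf
//   v_int32x16 t = v_trunc(v);   // toward zero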
1567 
1569 inline v_float32x16 v_cvt_f32(const v_int32x16& a)
1570 { return v_float32x16(_mm512_cvtepi32_ps(a.val)); }
1571 
1572 inline v_float32x16 v_cvt_f32(const v_float64x8& a)
1573 { return v_float32x16(_mm512_cvtpd_pslo(a.val)); }
1574 
1575 inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b)
1576 { return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }
1577 
1578 inline v_float64x8 v_cvt_f64(const v_int32x16& a)
1579 { return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }
1580 
1581 inline v_float64x8 v_cvt_f64_high(const v_int32x16& a)
1582 { return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }
1583 
1584 inline v_float64x8 v_cvt_f64(const v_float32x16& a)
1585 { return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }
1586 
1587 inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
1588 { return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }
1589 
1590 // Technique from Mysticial and wim: https://stackoverflow.com/q/41144668
1591 inline v_float64x8 v_cvt_f64(const v_int64x8& v)
1592 {
1593 #if CV_AVX_512DQ
1594  return v_float64x8(_mm512_cvtepi64_pd(v.val));
1595 #else
1596  // constants encoded as floating-point
1597  __m512i magic_i_lo = _mm512_set1_epi64(0x4330000000000000); // 2^52
1598  __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
1599  __m512i magic_i_all = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
1600  __m512d magic_d_all = _mm512_castsi512_pd(magic_i_all);
1601 
1602  // Blend the 32 least significant bits of each element of v with magic_i_lo
1603  __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
1604  // Extract the 32 most significant bits of v
1605  __m512i v_hi = _mm512_srli_epi64(v.val, 32);
1606  // Flip the msb of v_hi and blend with 0x45300000
1607  v_hi = _mm512_xor_si512(v_hi, magic_i_hi32);
1608  // Compute in double precision
1609  __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
1610  // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
1611  __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
1612  return v_float64x8(result);
1613 #endif
1614 }
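// Note on the fallback path above (assumes round-to-nearest): the two 32-bit halves are embedded
// into the mantissas of large "magic" doubles (2^52 for the low half, 2^84 + 2^63 for the
// sign-flipped high half), so subtracting magic_d_all and then adding the low part reconstructs
// hi*2^32 + lo with at most one rounding, in the final addition.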
1615 
1617 
1618 inline v_int8x64 v512_lut(const schar* tab, const int* idx)
1619 {
1620  __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1));
1621  __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
1622  __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
1623  __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
1624  return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3));
1625 }
1626 inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)
1627 {
1628  __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1));
1629  __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
1630  return v_int8x64(_v512_combine(p0, p1));
1631 }
1632 inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)
1633 {
1634  return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
1635 }
1636 inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); }
1637 inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); }
1638 inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); }
1639 
1640 inline v_int16x32 v512_lut(const short* tab, const int* idx)
1641 {
1642  __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 2));
1643  __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
1644  return v_int16x32(_v512_combine(p0, p1));
1645 }
1646 inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)
1647 {
1648  return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
1649 }
1650 inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)
1651 {
1652 #if defined(__GNUC__)
1653  return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
1654 #else
1655  return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
1656 #endif
1657 }
1658 inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); }
1659 inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); }
1660 inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); }
1661 
1662 inline v_int32x16 v512_lut(const int* tab, const int* idx)
1663 {
1664  return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
1665 }
1666 inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)
1667 {
1668 #if defined(__GNUC__)
1669  return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
1670 #else
1671  return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
1672 #endif
1673 }
1674 inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)
1675 {
1676  return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
1677  _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
1678  _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
1679  _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
1680  _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
1681 }
1682 inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); }
1683 inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); }
1684 inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); }
1685 
1686 inline v_int64x8 v512_lut(const int64* tab, const int* idx)
1687 {
1688 #if defined(__GNUC__)
1689  return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
1690 #else
1691  return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8));
1692 #endif
1693 }
1694 inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)
1695 {
1696  return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
1697  _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
1698  _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
1699  _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
1700  _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
1701 }
1702 inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); }
1703 inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); }
1704 
1705 inline v_float32x16 v512_lut(const float* tab, const int* idx)
1706 {
1707  return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
1708 }
1709 inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); }
1710 inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); }
1711 
1712 inline v_float64x8 v512_lut(const double* tab, const int* idx)
1713 {
1714  return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
1715 }
1716 inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)
1717 {
1718  return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
1719  _mm_loadu_pd(tab + idx[0])),
1720  _mm_loadu_pd(tab + idx[1]), 1),
1721  _mm_loadu_pd(tab + idx[2]), 2),
1722  _mm_loadu_pd(tab + idx[3]), 3));
1723 }
1724 
1725 inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)
1726 {
1727  return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
1728 }
1729 
1730 inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)
1731 {
1732  return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1733 }
1734 
1735 inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)
1736 {
1737  return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
1738 }
1739 
1740 inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)
1741 {
1742  return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
1743 }
1744 
1745 inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y)
1746 {
1747  x.val = _mm512_i32gather_ps(idxvec.val, tab, 4);
1748  y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4);
1749 }
1750 
1751 inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y)
1752 {
1753  x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8);
1754  y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8);
1755 }
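// Illustrative usage sketch (hypothetical table tab and index array idx):
//   int idx[16];                                  // one table index per output lane (caller-filled)
//   v_float32x16 gathered = v512_lut(tab, idx);   // gathered[i] = tab[idx[i]]
// v512_lut_pairs / v512_lut_quads read 2 / 4 consecutive elements per index instead, and so
// consume only 8 / 4 indices for the same vector width.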
1756 
1757 inline v_int8x64 v_interleave_pairs(const v_int8x64& vec)
1758 {
1759  return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200)));
1760 }
1761 inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1762 inline v_int8x64 v_interleave_quads(const v_int8x64& vec)
1763 {
1764  return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400)));
1765 }
1766 inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1767 
1768 inline v_int16x32 v_interleave_pairs(const v_int16x32& vec)
1769 {
1770  return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100)));
1771 }
1772 inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1773 inline v_int16x32 v_interleave_quads(const v_int16x32& vec)
1774 {
1775  return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100)));
1776 }
1777 inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1778 
1779 inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
1780 {
1781  return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD));
1782 }
1783 inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1784 inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1785 
1786 inline v_int8x64 v_pack_triplets(const v_int8x64& vec)
1787 {
1788  return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1789  0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000),
1790  _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100))));
1791 }
1792 inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1793 
1794 inline v_int16x32 v_pack_triplets(const v_int16x32& vec)
1795 {
1796  return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015,
1797  0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val));
1798 }
1799 inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1800 
1801 inline v_int32x16 v_pack_triplets(const v_int32x16& vec)
1802 {
1803  return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1804  0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
1805 }
1806 inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1807 inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
1808 {
1809  return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1810  0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
1811 }
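// Note: v_pack_triplets compacts each group of four lanes (x, y, z, unused) into contiguous
// (x, y, z) triplets at the front of the vector; the trailing lanes hold unspecified padding
// (copies of the last input elements) and are not meant to be stored. This is typically used
// when converting 4-channel intermediate data into packed 3-channel output.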
1812 
1814 
1816 
1817 // 16 >> 32
1818 inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
1819 { return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
1820 inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
1821 { return v_dotprod(a, b) + c; }
1822 
1823 // 32 >> 64
1824 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
1825 {
1826  __m512i even = _mm512_mul_epi32(a.val, b.val);
1827  __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
1828  return v_int64x8(_mm512_add_epi64(even, odd));
1829 }
1830 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
1831 { return v_dotprod(a, b) + c; }
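// Illustrative usage sketch (hypothetical 16-bit buffers p and q): v_dotprod multiplies
// corresponding lanes and adds adjacent pairs into the next-wider signed type; the
// three-argument form accumulates into an existing sum.
//   v_int16x32 a = v512_load(p), b = v512_load(q);
//   v_int32x16 sum = v512_setall_s32(0);
//   v_int32x16 d   = v_dotprod(a, b);        // d[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]
//   sum            = v_dotprod(a, b, sum);   // same products added to the accumulator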
1832 
1833 // 8 >> 32
1834 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
1835 {
1836  __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
1837  __m512i odd_a = _mm512_srli_epi16(a.val, 8);
1838 
1839  __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
1840  __m512i odd_b = _mm512_srli_epi16(b.val, 8);
1841 
1842  __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
1843  __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
1844  return v_uint32x16(_mm512_add_epi32(prod0, prod1));
1845 }
1846 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
1847 { return v_dotprod_expand(a, b) + c; }
1848 
1849 inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
1850 {
1851  __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
1852  __m512i odd_a = _mm512_srai_epi16(a.val, 8);
1853 
1854  __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
1855  __m512i odd_b = _mm512_srai_epi16(b.val, 8);
1856 
1857  __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
1858  __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
1859  return v_int32x16(_mm512_add_epi32(prod0, prod1));
1860 }
1861 inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
1862 { return v_dotprod_expand(a, b) + c; }
1863 
1864 // 16 >> 64
1865 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
1866 {
1867  __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
1868  __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
1869  __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
1870  __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
1871 
1872  __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
1873  __m512i p13 = _mm512_srli_epi64(mul0, 32);
1874  __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
1875  __m512i p57 = _mm512_srli_epi64(mul1, 32);
1876 
1877  __m512i p15_ = _mm512_add_epi64(p02, p13);
1878  __m512i p9d_ = _mm512_add_epi64(p46, p57);
1879 
1880  return v_uint64x8(_mm512_add_epi64(
1881  _mm512_unpacklo_epi64(p15_, p9d_),
1882  _mm512_unpackhi_epi64(p15_, p9d_)
1883  ));
1884 }
1885 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
1886 { return v_dotprod_expand(a, b) + c; }
1887 
1888 inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
1889 {
1890  __m512i prod = _mm512_madd_epi16(a.val, b.val);
1891  __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
1892  __m512i odd = _mm512_srai_epi64(prod, 32);
1893  return v_int64x8(_mm512_add_epi64(even, odd));
1894 }
1895 inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
1896 { return v_dotprod_expand(a, b) + c; }
1897 
1898 // 32 >> 64f
1899 inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
1900 { return v_cvt_f64(v_dotprod(a, b)); }
1901 inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
1902 { return v_dotprod_expand(a, b) + c; }
1903 
1905 
1906 // 16 >> 32
1907 inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
1908 { return v_dotprod(a, b); }
1909 inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
1910 { return v_dotprod(a, b, c); }
1911 
1912 // 32 >> 64
1913 inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
1914 { return v_dotprod(a, b); }
1915 inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
1916 { return v_dotprod(a, b, c); }
1917 
1918 // 8 >> 32
1919 inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b)
1920 { return v_dotprod_expand(a, b); }
1921 inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
1922 { return v_dotprod_expand(a, b, c); }
1923 
1924 inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b)
1925 { return v_dotprod_expand(a, b); }
1926 inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
1927 { return v_dotprod_expand(a, b, c); }
1928 
1929 // 16 >> 64
1930 inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
1931 {
1932  __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
1933  __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
1934  __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
1935  __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
1936 
1937  __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
1938  __m512i p13 = _mm512_srli_epi64(mul0, 32);
1939  __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
1940  __m512i p57 = _mm512_srli_epi64(mul1, 32);
1941 
1942  __m512i p15_ = _mm512_add_epi64(p02, p13);
1943  __m512i p9d_ = _mm512_add_epi64(p46, p57);
1944  return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
1945 }
1946 inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
1947 { return v_dotprod_expand_fast(a, b) + c; }
1948 
1949 inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
1950 { return v_dotprod_expand(a, b); }
1951 inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
1952 { return v_dotprod_expand(a, b, c); }
1953 
1954 // 32 >> 64f
1955 inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
1956 { return v_dotprod_expand(a, b); }
1957 inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
1958 { return v_dotprod_expand(a, b) + c; }
1959 
1960 
1961 #define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
1962  v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1963 
1964 inline v_float32x16 v_matmul(const v_float32x16& v,
1965  const v_float32x16& m0, const v_float32x16& m1,
1966  const v_float32x16& m2, const v_float32x16& m3)
1967 {
1968  v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
1969  v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
1970  v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
1971  v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
1972  return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1973 }
1974 
1975 inline v_float32x16 v_matmuladd(const v_float32x16& v,
1976  const v_float32x16& m0, const v_float32x16& m1,
1977  const v_float32x16& m2, const v_float32x16& a)
1978 {
1979  v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
1980  v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
1981  v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
1982  return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1983 }
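// Note: v_matmul treats the 16 float lanes as four packed 4-component vectors and m0..m3 as the
// four matrix columns replicated per 128-bit block, so one call performs four independent 4x4
// matrix-vector products; v_matmuladd does the same with a constant vector a added instead of
// the fourth column term.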
1984 
1985 #define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1986  inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1987  const _Tpvec& a2, const _Tpvec& a3, \
1988  _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1989  { \
1990  __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \
1991  __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \
1992  __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \
1993  __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \
1994  b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \
1995  b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \
1996  b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \
1997  b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \
1998  }
1999 
2000 OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2001 OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2002 OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps)
2003 
2004 
2006 /* Expand */
2007 #define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
2008  inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
2009  { \
2010  b0.val = intrin(_v512_extract_low(a.val)); \
2011  b1.val = intrin(_v512_extract_high(a.val)); \
2012  } \
2013  inline _Tpwvec v_expand_low(const _Tpvec& a) \
2014  { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \
2015  inline _Tpwvec v_expand_high(const _Tpvec& a) \
2016  { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \
2017  inline _Tpwvec v512_load_expand(const _Tp* ptr) \
2018  { \
2019  __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \
2020  return _Tpwvec(intrin(a)); \
2021  }
2022 
2023 OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16)
2024 OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16)
2025 OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32)
2026 OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32)
2027 OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64)
2028 OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64)
2029 
2030 #define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
2031  inline _Tpvec v512_load_expand_q(const _Tp* ptr) \
2032  { \
2033  __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
2034  return _Tpvec(intrin(a)); \
2035  }
2036 
2037 OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32)
2038 OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32)
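// Illustrative usage sketch (hypothetical uchar buffer src): the expand family widens each lane
// to the next-larger type.
//   v_uint8x64 a = v512_load(src);
//   v_uint16x32 lo, hi;
//   v_expand(a, lo, hi);                       // lo = first 32 lanes widened, hi = last 32
//   v_uint16x32 w = v512_load_expand(src);     // load 32 bytes and widen in one step
//   v_uint32x16 q = v512_load_expand_q(src);   // load 16 bytes and widen straight to 32-bit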
2039 
2040 /* pack */
2041 // 16
2042 inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
2043 { return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
2044 
2045 inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
2046 {
2047  const __m512i t = _mm512_set1_epi16(255);
2048  return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t))));
2049 }
2050 
2051 inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
2052 {
2053  return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
2054 }
2055 
2056 inline void v_pack_store(schar* ptr, const v_int16x32& a)
2057 { v_store_low(ptr, v_pack(a, a)); }
2058 
2059 inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
2060 {
2061  const __m512i m = _mm512_set1_epi16(255);
2062  _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
2063 }
2064 
2065 inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
2066 { v_store_low(ptr, v_pack_u(a, a)); }
2067 
2068 template<int n> inline
2069 v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
2070 {
2071  // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
2072  v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
2073  return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
2074  v_reinterpret_as_s16((b + delta) >> n));
2075 }
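// Illustrative usage sketch: v_rshr_pack<n> adds the rounding constant 2^(n-1), shifts right by n
// and then packs with saturation, i.e. a fixed-point "round half up" narrowing
// (hypothetical 16-bit buffers p and q):
//   v_uint16x32 a = v512_load(p), b = v512_load(q);
//   v_uint8x64 out = v_rshr_pack<4>(a, b);   // per lane: saturate_cast<uchar>((x + 8) >> 4)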
2076 
2077 template<int n> inline
2078 void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
2079 {
2080  v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
2081  v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
2082 }
2083 
2084 template<int n> inline
2085 v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
2086 {
2087  v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2088  return v_pack_u((a + delta) >> n, (b + delta) >> n);
2089 }
2090 
2091 template<int n> inline
2092 void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
2093 {
2094  v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2095  v_pack_u_store(ptr, (a + delta) >> n);
2096 }
2097 
2098 template<int n> inline
2099 v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
2100 {
2101  v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2102  return v_pack((a + delta) >> n, (b + delta) >> n);
2103 }
2104 
2105 template<int n> inline
2106 void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
2107 {
2108  v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2109  v_pack_store(ptr, (a + delta) >> n);
2110 }
2111 
2112 // 32
2113 inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
2114 { return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
2115 
2116 inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
2117 {
2118  const __m512i m = _mm512_set1_epi32(65535);
2119  return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
2120 }
2121 
2122 inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
2123 { return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
2124 
2125 inline void v_pack_store(short* ptr, const v_int32x16& a)
2126 { v_store_low(ptr, v_pack(a, a)); }
2127 
2128 inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
2129 {
2130  const __m512i m = _mm512_set1_epi32(65535);
2131  _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
2132 }
2133 
2134 inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
2135 { v_store_low(ptr, v_pack_u(a, a)); }
2136 
2137 
2138 template<int n> inline
2139 v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
2140 {
2141  v_uint32x16 delta = v512_setall_u32(1 << (n-1));
2142  return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
2143  v_reinterpret_as_s32((b + delta) >> n));
2144 }
2145 
2146 template<int n> inline
2147 void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
2148 {
2149  v_uint32x16 delta = v512_setall_u32(1 << (n-1));
2150  v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
2151 }
2152 
2153 template<int n> inline
2154 v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
2155 {
2156  v_int32x16 delta = v512_setall_s32(1 << (n-1));
2157  return v_pack_u((a + delta) >> n, (b + delta) >> n);
2158 }
2159 
2160 template<int n> inline
2161 void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
2162 {
2163  v_int32x16 delta = v512_setall_s32(1 << (n-1));
2164  v_pack_u_store(ptr, (a + delta) >> n);
2165 }
2166 
2167 template<int n> inline
2168 v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
2169 {
2170  v_int32x16 delta = v512_setall_s32(1 << (n-1));
2171  return v_pack((a + delta) >> n, (b + delta) >> n);
2172 }
2173 
2174 template<int n> inline
2175 void v_rshr_pack_store(short* ptr, const v_int32x16& a)
2176 {
2177  v_int32x16 delta = v512_setall_s32(1 << (n-1));
2178  v_pack_store(ptr, (a + delta) >> n);
2179 }
2180 
2181 // 64
2182 // Non-saturating pack
2183 inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
2184 { return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }
2185 
2186 inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
2187 { return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2188 
2189 inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
2190 { _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }
2191 
2192 inline void v_pack_store(int* ptr, const v_int64x8& b)
2193 { v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2194 
2195 template<int n> inline
2196 v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
2197 {
2198  v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
2199  return v_pack((a + delta) >> n, (b + delta) >> n);
2200 }
2201 
2202 template<int n> inline
2203 void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
2204 {
2205  v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
2206  v_pack_store(ptr, (a + delta) >> n);
2207 }
2208 
2209 template<int n> inline
2210 v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
2211 {
2212  v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
2213  return v_pack((a + delta) >> n, (b + delta) >> n);
2214 }
2215 
2216 template<int n> inline
2217 void v_rshr_pack_store(int* ptr, const v_int64x8& a)
2218 {
2219  v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
2220  v_pack_store(ptr, (a + delta) >> n);
2221 }
2222 
2223 // pack boolean
2224 inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
2225 { return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
2226 
2227 inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
2228  const v_uint32x16& c, const v_uint32x16& d)
2229 {
2230  __m512i ab = _mm512_packs_epi32(a.val, b.val);
2231  __m512i cd = _mm512_packs_epi32(c.val, d.val);
2232 
2233  return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
2234 }
2235 
2236 inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
2237  const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
2238  const v_uint64x8& g, const v_uint64x8& h)
2239 {
2240  __m512i ab = _mm512_packs_epi32(a.val, b.val);
2241  __m512i cd = _mm512_packs_epi32(c.val, d.val);
2242  __m512i ef = _mm512_packs_epi32(e.val, f.val);
2243  __m512i gh = _mm512_packs_epi32(g.val, h.val);
2244 
2245  __m512i abcd = _mm512_packs_epi32(ab, cd);
2246  __m512i efgh = _mm512_packs_epi32(ef, gh);
2247 
2248  return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
2249  27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
2250 }
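// Note: the v_pack_b overloads narrow comparison masks (all-zero / all-one lanes) of 16-, 32- or
// 64-bit width down to one byte per lane while preserving lane order, so a wide mask can be
// stored or tested as a single v_uint8x64.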
2251 
2252 /* Recombine */
2253 // implemented above, together with the load and store operations
2254 
2255 /* Extract */
2256 #define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
2257  template<int s> \
2258  inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2259  { return v_rotate_right<s>(a, b); }
2260 
2261 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
2262 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
2263 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
2264 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
2265 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
2266 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
2267 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
2268 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
2269 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
2270 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
2271 
2272 #define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
2273 template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }
2274 
2275 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
2276 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
2277 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
2278 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
2279 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
2280 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
2281 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
2282 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
2283 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
2284 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
2285 
2286 template<int i>
2287 inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
2288 {
2289  static const __m512i perm = _mm512_set1_epi32((char)i);
2290  return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
2291 }
2292 
2293 template<int i>
2294 inline v_int32x16 v_broadcast_element(const v_int32x16 &a)
2295 { return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2296 
2297 template<int i>
2298 inline v_float32x16 v_broadcast_element(const v_float32x16 &a)
2299 { return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
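// Illustrative usage sketch (hypothetical buffer buf):
//   v_float32x16 v = v512_load(buf);
//   float x5 = v_extract_n<5>(v);                   // scalar copy of lane 5
//   v_float32x16 b5 = v_broadcast_element<5>(v);    // every lane set to lane 5
//   v_float32x16 e  = v_extract<3>(v, v);           // lane rotation, see v_rotate_right above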
2300 
2301 
2303 
2304 inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
2305 {
2306  __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2307  __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2308 #if CV_AVX_512VBMI
2309  __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
2310  94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
2311  62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2312  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2313  __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
2314  95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
2315  63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2316  31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2317  a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
2318  b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
2319 #else
2320  __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
2321  __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
2322  __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
2323  __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2324  __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2325  a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
2326  b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
2327 #endif
2328 }
2329 
2330 inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
2331 {
2332  __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2333  __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2334  __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2335  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2336  __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2337  31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2338  a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
2339  b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
2340 }
2341 
2342 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
2343 {
2344  __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2345  __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2346  __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2347  __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2348  a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1));
2349  b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1));
2350 }
2351 
2352 inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b )
2353 {
2354  __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2355  __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2356  __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2357  __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2358  a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1));
2359  b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
2360 }
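
// Usage sketch (illustrative only; assumes the v_store helper defined earlier in this
// header): the two-channel overloads split an interleaved pair stream (e.g. CbCr or
// complex numbers) into separate planes.
static inline void example_deinterleave2(const ushort* interleaved, ushort* plane0, ushort* plane1)
{
    v_uint16x32 a, b;
    v_load_deinterleave(interleaved, a, b);  // reads 64 ushorts: even indices -> a, odd -> b
    v_store(plane0, a);
    v_store(plane1, b);
}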
2361 
2362 inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
2363 {
2364  __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2365  __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2366  __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
2367 
2368 #if CV_AVX_512VBMI2
2369  __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81,
2370  78, 75, 72, 69, 66, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33,
2371  30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 62, 59, 56, 53, 50,
2372  47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
2373  __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1);
2374  __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2);
2375  __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
2376  _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
2377  77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97,
2378  94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49,
2379  46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2);
2380  a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
2381  b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
2382  c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
2383 #elif CV_AVX_512VBMI
2384  __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
2385  __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
2386  __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
2387  a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
2388  77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48,
2389  46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24,
2390  23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2));
2391  b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40,
2392  39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17,
2393  15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114,
2394  111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0));
2395  c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18,
2396  15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98,
2397  95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50,
2398  47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1));
2399 #else
2400  __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
2401  45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2402  __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
2403  __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
2404  __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
2405 
2406  __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2);
2407  __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
2408  14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
2409  __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
2410  a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
2411  c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
2412  b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
2413 #endif
2414 }
2415 
2416 inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
2417 {
2418  __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2419  __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2420  __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2421 
2422  __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
2423  45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2424  __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
2425  __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
2426  __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
2427 
2428  a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
2429  b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
2430  14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
2431  c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
2432 }
2433 
2434 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
2435 {
2436  __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2437  __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2438  __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2439 
2440  __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2441  __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1);
2442  __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
2443  __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);
2444 
2445  a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
2446  b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
2447  c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
2448 }
2449 
2450 inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
2451 {
2452  __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2453  __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2454  __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2455 
2456  __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0);
2457  __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1);
2458  __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
2459  __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);
2460 
2461  a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
2462  c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
2463  b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
2464 }
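
// Usage sketch (illustrative only; assumes the v_store helper defined earlier in this
// header): the three-channel overloads are the building block of packed BGR -> planar
// conversions.
static inline void example_deinterleave_bgr(const uchar* bgr, uchar* bpl, uchar* gpl, uchar* rpl)
{
    v_uint8x64 b, g, r;
    v_load_deinterleave(bgr, b, g, r);   // consumes 3 x 64 = 192 interleaved bytes
    v_store(bpl, b); v_store(gpl, g); v_store(rpl, r);
}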
2465 
2466 inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
2467 {
2468  __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2469  __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2470  __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
2471  __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192));
2472 
2473 #if CV_AVX_512VBMI
2474  __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
2475  94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
2476  62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2477  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2478  __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
2479  95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
2480  63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2481  31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2482 
2483  __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1);
2484  __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1);
2485  __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
2486  __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);
2487 
2488  a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
2489  c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
2490  b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
2491  d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
2492 #else
2493  __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
2494  __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
2495  __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask);
2496  __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask);
2497  __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask);
2498 
2499  __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2500  __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2501 
2502  __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1);
2503  __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1);
2504  __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
2505  __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);
2506 
2507  a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
2508  c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
2509  b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
2510  d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
2511 #endif
2512 }
2513 
2514 inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
2515 {
2516  __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2517  __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2518  __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2519  __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96));
2520 
2521  __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2522  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2523  __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2524  31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2525 
2526  __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1);
2527  __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1);
2528  __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
2529  __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);
2530 
2531  a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
2532  c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
2533  b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
2534  d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
2535 }
2536 
2537 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
2538 {
2539  __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2540  __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2541  __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2542  __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48));
2543 
2544  __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2545  __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2546 
2547  __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1);
2548  __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1);
2549  __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
2550  __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);
2551 
2552  a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
2553  c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
2554  b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
2555  d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
2556 }
2557 
2558 inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
2559 {
2560  __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2561  __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2562  __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2563  __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24));
2564 
2565  __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2566  __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2567 
2568  __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1);
2569  __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1);
2570  __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
2571  __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);
2572 
2573  a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
2574  c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
2575  b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
2576  d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
2577 }
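
// Usage sketch (illustrative only; assumes the v_store helper defined earlier in this
// header): the four-channel overloads make it cheap to peel a single plane (here alpha)
// out of packed BGRA data.
static inline void example_extract_alpha(const uchar* bgra, uchar* alpha)
{
    v_uint8x64 b, g, r, a;
    v_load_deinterleave(bgra, b, g, r, a);  // consumes 4 x 64 = 256 interleaved bytes
    v_store(alpha, a);
}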
2578 
2580 
2581 inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
2582                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2583 {
2584  v_uint8x64 low, high;
2585  v_zip(x, y, low, high);
2586  if( mode == hal::STORE_ALIGNED_NOCACHE )
2587  {
2588  _mm512_stream_si512((__m512i*)ptr, low.val);
2589  _mm512_stream_si512((__m512i*)(ptr + 64), high.val);
2590  }
2591  else if( mode == hal::STORE_ALIGNED )
2592  {
2593  _mm512_store_si512((__m512i*)ptr, low.val);
2594  _mm512_store_si512((__m512i*)(ptr + 64), high.val);
2595  }
2596  else
2597  {
2598  _mm512_storeu_si512((__m512i*)ptr, low.val);
2599  _mm512_storeu_si512((__m512i*)(ptr + 64), high.val);
2600  }
2601 }
2602 
2603 inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
2604                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2605 {
2606  v_uint16x32 low, high;
2607  v_zip(x, y, low, high);
2608  if( mode == hal::STORE_ALIGNED_NOCACHE )
2609  {
2610  _mm512_stream_si512((__m512i*)ptr, low.val);
2611  _mm512_stream_si512((__m512i*)(ptr + 32), high.val);
2612  }
2613  else if( mode == hal::STORE_ALIGNED )
2614  {
2615  _mm512_store_si512((__m512i*)ptr, low.val);
2616  _mm512_store_si512((__m512i*)(ptr + 32), high.val);
2617  }
2618  else
2619  {
2620  _mm512_storeu_si512((__m512i*)ptr, low.val);
2621  _mm512_storeu_si512((__m512i*)(ptr + 32), high.val);
2622  }
2623 }
2624 
2625 inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
2626                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2627 {
2628  v_uint32x16 low, high;
2629  v_zip(x, y, low, high);
2630  if( mode == hal::STORE_ALIGNED_NOCACHE )
2631  {
2632  _mm512_stream_si512((__m512i*)ptr, low.val);
2633  _mm512_stream_si512((__m512i*)(ptr + 16), high.val);
2634  }
2635  else if( mode == hal::STORE_ALIGNED )
2636  {
2637  _mm512_store_si512((__m512i*)ptr, low.val);
2638  _mm512_store_si512((__m512i*)(ptr + 16), high.val);
2639  }
2640  else
2641  {
2642  _mm512_storeu_si512((__m512i*)ptr, low.val);
2643  _mm512_storeu_si512((__m512i*)(ptr + 16), high.val);
2644  }
2645 }
2646 
2647 inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
2648                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2649 {
2650  v_uint64x8 low, high;
2651  v_zip(x, y, low, high);
2652  if( mode == hal::STORE_ALIGNED_NOCACHE )
2653  {
2654  _mm512_stream_si512((__m512i*)ptr, low.val);
2655  _mm512_stream_si512((__m512i*)(ptr + 8), high.val);
2656  }
2657  else if( mode == hal::STORE_ALIGNED )
2658  {
2659  _mm512_store_si512((__m512i*)ptr, low.val);
2660  _mm512_store_si512((__m512i*)(ptr + 8), high.val);
2661  }
2662  else
2663  {
2664  _mm512_storeu_si512((__m512i*)ptr, low.val);
2665  _mm512_storeu_si512((__m512i*)(ptr + 8), high.val);
2666  }
2667 }
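
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): v_store_interleave is the inverse of v_load_deinterleave; the optional
// hal::StoreMode selects unaligned, aligned or non-temporal stores.
static inline void example_interleave2(const unsigned* plane0, const unsigned* plane1, unsigned* interleaved)
{
    v_uint32x16 a = v512_load(plane0), b = v512_load(plane1);
    v_store_interleave(interleaved, a, b);                         // unaligned store by default
    // v_store_interleave(interleaved, a, b, hal::STORE_ALIGNED);  // if interleaved is 64-byte aligned
}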
2668 
2669 inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
2670                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2671 {
2672 #if CV_AVX_512VBMI
2673  __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122,
2674  79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74,
2675  10, 116, 73, 9, 115, 72, 8, 114, 71, 7, 113, 70, 6, 112, 69, 5,
2676  111, 68, 4, 110, 67, 3, 109, 66, 2, 108, 65, 1, 107, 64, 0, 106);
2677  __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16,
2678  37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32,
2679  95, 10, 31, 94, 9, 30, 93, 8, 29, 92, 7, 28, 91, 6, 27, 90,
2680  5, 26, 89, 4, 25, 88, 3, 24, 87, 2, 23, 86, 1, 22, 85, 0);
2681  __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101,
2682  122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117,
2683  53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48,
2684  90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85);
2685  __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
2686  __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
2687  __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);
2688 
2689  __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
2690  __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
2691  __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
2692 #else
2693  __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
2694  __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
2695  __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
2696  __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);
2697 
2698  __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
2699  5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
2700  __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
2701  47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
2702  __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
2703  26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
2704  __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1);
2705  __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
2706  __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);
2707 
2708  __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
2709  __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
2710  __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
2711 #endif
2712 
2713  if( mode == hal::STORE_ALIGNED_NOCACHE )
2714  {
2715  _mm512_stream_si512((__m512i*)ptr, bgr0);
2716  _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
2717  _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
2718  }
2719  else if( mode == hal::STORE_ALIGNED )
2720  {
2721  _mm512_store_si512((__m512i*)ptr, bgr0);
2722  _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
2723  _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
2724  }
2725  else
2726  {
2727  _mm512_storeu_si512((__m512i*)ptr, bgr0);
2728  _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
2729  _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
2730  }
2731 }
2732 
2733 inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
2734                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2735 {
2736  __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
2737  5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
2738  __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
2739  47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
2740  __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
2741  26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
2742  __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
2743  __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
2744  __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);
2745 
2746  __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
2747  __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
2748  __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
2749 
2750  if( mode == hal::STORE_ALIGNED_NOCACHE )
2751  {
2752  _mm512_stream_si512((__m512i*)ptr, bgr0);
2753  _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
2754  _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
2755  }
2756  else if( mode == hal::STORE_ALIGNED )
2757  {
2758  _mm512_store_si512((__m512i*)ptr, bgr0);
2759  _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
2760  _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
2761  }
2762  else
2763  {
2764  _mm512_storeu_si512((__m512i*)ptr, bgr0);
2765  _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
2766  _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
2767  }
2768 }
2769 
2770 inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
2771                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2772 {
2773  __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
2774  __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26);
2775  __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
2776  __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);
2777 
2778  __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
2779  __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
2780  __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);
2781 
2782  if( mode == hal::STORE_ALIGNED_NOCACHE )
2783  {
2784  _mm512_stream_si512((__m512i*)ptr, bgr0);
2785  _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
2786  _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
2787  }
2788  else if( mode == hal::STORE_ALIGNED )
2789  {
2790  _mm512_store_si512((__m512i*)ptr, bgr0);
2791  _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
2792  _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
2793  }
2794  else
2795  {
2796  _mm512_storeu_si512((__m512i*)ptr, bgr0);
2797  _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
2798  _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
2799  }
2800 }
2801 
2802 inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
2803                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2804 {
2805  __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10);
2806  __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5);
2807  __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
2808  __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);
2809 
2810  __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
2811  __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
2812  __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);
2813 
2814  if( mode == hal::STORE_ALIGNED_NOCACHE )
2815  {
2816  _mm512_stream_si512((__m512i*)ptr, bgr0);
2817  _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
2818  _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
2819  }
2820  else if( mode == hal::STORE_ALIGNED )
2821  {
2822  _mm512_store_si512((__m512i*)ptr, bgr0);
2823  _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
2824  _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
2825  }
2826  else
2827  {
2828  _mm512_storeu_si512((__m512i*)ptr, bgr0);
2829  _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
2830  _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
2831  }
2832 }
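
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): planar -> packed BGR, the inverse of the three-channel v_load_deinterleave above.
static inline void example_interleave_bgr(const uchar* bpl, const uchar* gpl, const uchar* rpl, uchar* bgr)
{
    v_store_interleave(bgr, v512_load(bpl), v512_load(gpl), v512_load(rpl));  // writes 192 bytes
}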
2833 
2834 inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
2835                                 const v_uint8x64& c, const v_uint8x64& d,
2836                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2837 {
2838  v_uint8x64 br01, br23, ga01, ga23;
2839  v_zip(a, c, br01, br23);
2840  v_zip(b, d, ga01, ga23);
2841  v_uint8x64 bgra0, bgra1, bgra2, bgra3;
2842  v_zip(br01, ga01, bgra0, bgra1);
2843  v_zip(br23, ga23, bgra2, bgra3);
2844 
2845  if( mode == hal::STORE_ALIGNED_NOCACHE )
2846  {
2847  _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2848  _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val);
2849  _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val);
2850  _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val);
2851  }
2852  else if( mode == hal::STORE_ALIGNED )
2853  {
2854  _mm512_store_si512((__m512i*)ptr, bgra0.val);
2855  _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val);
2856  _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val);
2857  _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val);
2858  }
2859  else
2860  {
2861  _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2862  _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val);
2863  _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val);
2864  _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val);
2865  }
2866 }
2867 
2868 inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
2869                                 const v_uint16x32& c, const v_uint16x32& d,
2870                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2871 {
2872  v_uint16x32 br01, br23, ga01, ga23;
2873  v_zip(a, c, br01, br23);
2874  v_zip(b, d, ga01, ga23);
2875  v_uint16x32 bgra0, bgra1, bgra2, bgra3;
2876  v_zip(br01, ga01, bgra0, bgra1);
2877  v_zip(br23, ga23, bgra2, bgra3);
2878 
2879  if( mode == hal::STORE_ALIGNED_NOCACHE )
2880  {
2881  _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2882  _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val);
2883  _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val);
2884  _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val);
2885  }
2886  else if( mode == hal::STORE_ALIGNED )
2887  {
2888  _mm512_store_si512((__m512i*)ptr, bgra0.val);
2889  _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val);
2890  _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val);
2891  _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val);
2892  }
2893  else
2894  {
2895  _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2896  _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val);
2897  _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val);
2898  _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val);
2899  }
2900 }
2901 
2902 inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
2903                                 const v_uint32x16& c, const v_uint32x16& d,
2904                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2905 {
2906  v_uint32x16 br01, br23, ga01, ga23;
2907  v_zip(a, c, br01, br23);
2908  v_zip(b, d, ga01, ga23);
2909  v_uint32x16 bgra0, bgra1, bgra2, bgra3;
2910  v_zip(br01, ga01, bgra0, bgra1);
2911  v_zip(br23, ga23, bgra2, bgra3);
2912 
2913  if( mode == hal::STORE_ALIGNED_NOCACHE )
2914  {
2915  _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2916  _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val);
2917  _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val);
2918  _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val);
2919  }
2920  else if( mode == hal::STORE_ALIGNED )
2921  {
2922  _mm512_store_si512((__m512i*)ptr, bgra0.val);
2923  _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val);
2924  _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val);
2925  _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val);
2926  }
2927  else
2928  {
2929  _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2930  _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val);
2931  _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val);
2932  _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val);
2933  }
2934 }
2935 
2936 inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
2937                                 const v_uint64x8& c, const v_uint64x8& d,
2938                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2939 {
2940  v_uint64x8 br01, br23, ga01, ga23;
2941  v_zip(a, c, br01, br23);
2942  v_zip(b, d, ga01, ga23);
2943  v_uint64x8 bgra0, bgra1, bgra2, bgra3;
2944  v_zip(br01, ga01, bgra0, bgra1);
2945  v_zip(br23, ga23, bgra2, bgra3);
2946 
2947  if( mode == hal::STORE_ALIGNED_NOCACHE )
2948  {
2949  _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2950  _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val);
2951  _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val);
2952  _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val);
2953  }
2954  else if( mode == hal::STORE_ALIGNED )
2955  {
2956  _mm512_store_si512((__m512i*)ptr, bgra0.val);
2957  _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val);
2958  _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val);
2959  _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val);
2960  }
2961  else
2962  {
2963  _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2964  _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val);
2965  _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val);
2966  _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val);
2967  }
2968 }
2969 
2970 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2971 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2972 { \
2973  _Tpvec1 a1, b1; \
2974  v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2975  a0 = v_reinterpret_as_##suffix0(a1); \
2976  b0 = v_reinterpret_as_##suffix0(b1); \
2977 } \
2978 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2979 { \
2980  _Tpvec1 a1, b1, c1; \
2981  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2982  a0 = v_reinterpret_as_##suffix0(a1); \
2983  b0 = v_reinterpret_as_##suffix0(b1); \
2984  c0 = v_reinterpret_as_##suffix0(c1); \
2985 } \
2986 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2987 { \
2988  _Tpvec1 a1, b1, c1, d1; \
2989  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2990  a0 = v_reinterpret_as_##suffix0(a1); \
2991  b0 = v_reinterpret_as_##suffix0(b1); \
2992  c0 = v_reinterpret_as_##suffix0(c1); \
2993  d0 = v_reinterpret_as_##suffix0(d1); \
2994 } \
2995 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2996  hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2997 { \
2998  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2999  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3000  v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
3001 } \
3002 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
3003  hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3004 { \
3005  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3006  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3007  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3008  v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
3009 } \
3010 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3011  const _Tpvec0& c0, const _Tpvec0& d0, \
3012  hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3013 { \
3014  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3015  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3016  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3017  _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
3018  v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
3019 }
3020 
3021 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8)
3022 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16)
3023 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32)
3024 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32)
3025 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64)
3026 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64)
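
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): through the macro above, the signed and floating-point overloads forward to the
// unsigned kernels, so float planes can be (de)interleaved directly.
static inline void example_interleave_f32(const float* xs, const float* ys, float* xy)
{
    v_float32x16 x = v512_load(xs), y = v512_load(ys);
    v_store_interleave(xy, x, y);   // dispatched to the v_uint32x16 implementation
}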
3027 
3028 
3031 inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
3032 inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3033 inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3034 inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3035 
3036 inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); }
3037 inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); }
3038 inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
3039 inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
3040 inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
3041 inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
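
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): v_signmask packs the sign/mask bit of every lane into an integer, one bit per
// lane (the 8-bit vectors need all 64 bits, hence the int64 return type above).
static inline bool example_any_negative(const int* p)
{
    v_int32x16 v = v512_load(p);
    return v_signmask(v) != 0;   // bit i is set exactly when p[i] < 0
}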
3042 
3044 inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3045 inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
3046 inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3047 inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3048 inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3049 inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3050 inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3051 inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3052 
3053 inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
3054 inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
3055 inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
3056 inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
3057 inline bool v_check_all(const v_uint8x64& a) { return v_check_all(v_reinterpret_as_s8(a)); }
3058 inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); }
3059 inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
3060 inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
3061 inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); }
3062 inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); }
3063 inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
3064 inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
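
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): v_check_all / v_check_any test the sign (mask) bit of every lane, which is how
// comparison results are usually consumed.
static inline bool example_all_negative(const schar* p)
{
    v_int8x64 v = v512_load(p);
    return v_check_all(v);   // true only if every p[i] < 0
}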
3065 
3066 inline int v_scan_forward(const v_int8x64& a)
3067 {
3068  int64 mask = _mm512_movepi8_mask(a.val);
3069  int mask32 = (int)mask;
3070  return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
3071 }
3072 inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
3073 inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
3074 inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
3075 inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3076 inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3077 inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3078 inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3079 inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3080 inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
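
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): v_scan_forward returns the index of the first lane whose sign/mask bit is set,
// e.g. to locate the first match after a comparison.
static inline int example_first_negative(const short* p)
{
    v_int16x32 v = v512_load(p);
    return v_check_any(v) ? v_scan_forward(v) : -1;  // index of the first p[i] < 0, or -1
}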
3081 
3082 inline void v512_cleanup() { _mm256_zeroall(); }
3083 
3084 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3085 
3087 
3088 } // cv::
3089 
3090 #endif // OPENCV_HAL_INTRIN_AVX512_HPP