EstervQrCode 1.1.1
Library for QR code manipulation
intrin_neon.hpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44 
45 #ifndef OPENCV_HAL_INTRIN_NEON_HPP
46 #define OPENCV_HAL_INTRIN_NEON_HPP
47 
48 #include <algorithm>
49 #include "opencv2/core/utility.hpp"
50 
51 namespace cv
52 {
53 
55 
56 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
57 
58 #define CV_SIMD128 1
59 #if defined(__aarch64__) || defined(_M_ARM64)
60 #define CV_SIMD128_64F 1
61 #else
62 #define CV_SIMD128_64F 0
63 #endif
64 
65 // The following macro checks if the code is being compiled for the
66 // AArch64 execution state of Armv8, to enable the 128-bit
67 // intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
68 // the Arm C Language Extension (ACLE) specifications [1] to check the
69 // availability of 128-bit intrinsics, and it is supported by clang
70 // and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
71 // Visual Studio [2].
72 //
73 // [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
74 // [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
75 #if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
76 #define CV_NEON_AARCH64 1
77 #else
78 #define CV_NEON_AARCH64 0
79 #endif
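For illustration, this is the idiom the macro enables throughout the rest of the header (compare v_mul_expand further below): a hypothetical helper, not part of the original file, that widens the upper halves of two int16x8_t registers, using the AArch64-only vmull_high_s16 when available and the A32 split-and-multiply fallback otherwise.

// Illustrative sketch, not part of the original header.
inline int32x4_t widen_mul_high_sketch(int16x8_t a, int16x8_t b)
{
#if CV_NEON_AARCH64
    return vmull_high_s16(a, b);                           // one AArch64 instruction
#else
    return vmull_s16(vget_high_s16(a), vget_high_s16(b));  // split, then widen-multiply
#endif
}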
80 
81 
83 
84 #if CV_SIMD128_64F
85 #define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
86  inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
87  { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
88 #define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
89  inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \
90  { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
91 #else
92 #define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
93  inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
94  { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
95 #define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
96  inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
97  { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
98 #endif
99 
100 #if CV_SIMD128_64F
101 #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
102  template <typename T> static inline \
103  _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
104  template <typename T> static inline \
105  float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
106 #else
107 #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
108 #endif
109 
110 #define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
111  OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
112  OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
113  OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
114 
115 #define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
116  OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
117 
118 #define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
119  OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)
120 
121 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
122 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
123 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
124 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
125 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
126 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
127 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
128 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
129 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
130 #if CV_SIMD128_64F
131 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
132 #endif
133 
135 template<typename T> struct VTraits {
136  static inline int vlanes() { return T::nlanes; }
137  enum { max_nlanes = T::nlanes, nlanes = T::nlanes };
138  using lane_type = typename T::lane_type;
139 };
140 
141 template<typename T>
142 inline typename VTraits<T>::lane_type v_get0(const T& v) \
143 { \
144  return v.get0(); \
145 }
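A short usage sketch (relying on v_load, which is defined later in this header): VTraits reports the lane count and lane type of a register type, and v_get0 reads lane 0.

// Illustrative sketch only, not part of the original header.
void vtraits_example(const float* src)
{
    int lanes = VTraits<v_float32x4>::vlanes();   // 4 lanes in a 128-bit float32 register
    v_float32x4 v = v_load(src);                  // v_load is defined later in this header
    float first = v_get0(v);                      // lane 0 of the register
    (void)lanes; (void)first;
}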
147 
148 struct v_uint8x16
149 {
150  v_uint8x16() {}
151  explicit v_uint8x16(uint8x16_t v) : val(v) {}
152  v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
153  uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
154  {
155  uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
156  val = vld1q_u8(v);
157  }
158  uint8x16_t val;
159 
160 private:
161  friend struct VTraits<v_uint8x16>;
162  enum { nlanes = 16 };
163  typedef uchar lane_type;
164 
165  friend typename VTraits<v_uint8x16>::lane_type v_get0<v_uint8x16>(const v_uint8x16& v);
166  uchar get0() const
167  {
168  return vgetq_lane_u8(val, 0);
169  }
170 };
171 
172 struct v_int8x16
173 {
174  v_int8x16() {}
175  explicit v_int8x16(int8x16_t v) : val(v) {}
176  v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
177  schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
178  {
179  schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
180  val = vld1q_s8(v);
181  }
182  int8x16_t val;
183 
184 private:
185  friend struct VTraits<v_int8x16>;
186  enum { nlanes = 16 };
187  typedef schar lane_type;
188 
189  friend typename VTraits<v_int8x16>::lane_type v_get0<v_int8x16>(const v_int8x16& v);
190  schar get0() const
191  {
192  return vgetq_lane_s8(val, 0);
193  }
194 };
195 
196 struct v_uint16x8
197 {
198  v_uint16x8() {}
199  explicit v_uint16x8(uint16x8_t v) : val(v) {}
200  v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
201  {
202  ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
203  val = vld1q_u16(v);
204  }
205  uint16x8_t val;
206 
207 private:
208  friend struct VTraits<v_uint16x8>;
209  enum { nlanes = 8 };
210  typedef ushort lane_type;
211 
212  friend typename VTraits<v_uint16x8>::lane_type v_get0<v_uint16x8>(const v_uint16x8& v);
213  ushort get0() const
214  {
215  return vgetq_lane_u16(val, 0);
216  }
217 };
218 
219 struct v_int16x8
220 {
221  v_int16x8() {}
222  explicit v_int16x8(int16x8_t v) : val(v) {}
223  v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
224  {
225  short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
226  val = vld1q_s16(v);
227  }
228  int16x8_t val;
229 
230 private:
231  friend struct VTraits<v_int16x8>;
232  enum { nlanes = 8 };
233  typedef short lane_type;
234 
235  friend typename VTraits<v_int16x8>::lane_type v_get0<v_int16x8>(const v_int16x8& v);
236  short get0() const
237  {
238  return vgetq_lane_s16(val, 0);
239  }
240 };
241 
242 struct v_uint32x4
243 {
244  v_uint32x4() {}
245  explicit v_uint32x4(uint32x4_t v) : val(v) {}
246  v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
247  {
248  unsigned v[] = {v0, v1, v2, v3};
249  val = vld1q_u32(v);
250  }
251  uint32x4_t val;
252 
253 private:
254  friend struct VTraits<v_uint32x4>;
255  enum { nlanes = 4 };
256  typedef unsigned lane_type;
257 
258  friend typename VTraits<v_uint32x4>::lane_type v_get0<v_uint32x4>(const v_uint32x4& v);
259  unsigned get0() const
260  {
261  return vgetq_lane_u32(val, 0);
262  }
263 };
264 
265 struct v_int32x4
266 {
267  v_int32x4() {}
268  explicit v_int32x4(int32x4_t v) : val(v) {}
269  v_int32x4(int v0, int v1, int v2, int v3)
270  {
271  int v[] = {v0, v1, v2, v3};
272  val = vld1q_s32(v);
273  }
274  int32x4_t val;
275 
276 private:
277  friend struct VTraits<v_int32x4>;
278  enum { nlanes = 4 };
279  typedef int lane_type;
280 
281  friend typename VTraits<v_int32x4>::lane_type v_get0<v_int32x4>(const v_int32x4& v);
282  int get0() const
283  {
284  return vgetq_lane_s32(val, 0);
285  }
286 };
287 
288 struct v_float32x4
289 {
290  v_float32x4() {}
291  explicit v_float32x4(float32x4_t v) : val(v) {}
292  v_float32x4(float v0, float v1, float v2, float v3)
293  {
294  float v[] = {v0, v1, v2, v3};
295  val = vld1q_f32(v);
296  }
297  float32x4_t val;
298 
299 private:
300  friend struct VTraits<v_float32x4>;
301  enum { nlanes = 4 };
302  typedef float lane_type;
303 
304  friend typename VTraits<v_float32x4>::lane_type v_get0<v_float32x4>(const v_float32x4& v);
305  float get0() const
306  {
307  return vgetq_lane_f32(val, 0);
308  }
309 };
310 
311 struct v_uint64x2
312 {
313  v_uint64x2() {}
314  explicit v_uint64x2(uint64x2_t v) : val(v) {}
315  v_uint64x2(uint64 v0, uint64 v1)
316  {
317  uint64 v[] = {v0, v1};
318  val = vld1q_u64(v);
319  }
320  uint64x2_t val;
321 private:
322  friend struct VTraits<v_uint64x2>;
323  enum { nlanes = 2 };
324  typedef uint64 lane_type;
325 
326  friend typename VTraits<v_uint64x2>::lane_type v_get0<v_uint64x2>(const v_uint64x2& v);
327  uint64 get0() const
328  {
329  return vgetq_lane_u64(val, 0);
330  }
331 };
332 
333 struct v_int64x2
334 {
335  v_int64x2() {}
336  explicit v_int64x2(int64x2_t v) : val(v) {}
337  v_int64x2(int64 v0, int64 v1)
338  {
339  int64 v[] = {v0, v1};
340  val = vld1q_s64(v);
341  }
342  int64x2_t val;
343 
344 private:
345  friend struct VTraits<v_int64x2>;
346  enum { nlanes = 2 };
347  typedef int64 lane_type;
348 
349  friend typename VTraits<v_int64x2>::lane_type v_get0<v_int64x2>(const v_int64x2& v);
350  int64 get0() const
351  {
352  return vgetq_lane_s64(val, 0);
353  }
354 };
355 
356 #if CV_SIMD128_64F
357 struct v_float64x2
358 {
359  v_float64x2() {}
360  explicit v_float64x2(float64x2_t v) : val(v) {}
361  v_float64x2(double v0, double v1)
362  {
363  double v[] = {v0, v1};
364  val = vld1q_f64(v);
365  }
366 
367  float64x2_t val;
368 private:
369  friend struct VTraits<v_float64x2>;
370  enum { nlanes = 2 };
371  typedef double lane_type;
372 
373  friend typename VTraits<v_float64x2>::lane_type v_get0<v_float64x2>(const v_float64x2& v);
374  double get0() const
375  {
376  return vgetq_lane_f64(val, 0);
377  }
378 };
379 #endif
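A minimal sketch of how these register wrappers are constructed and read back: the element-wise constructors above go through a temporary array and vld1q_*, while the explicit constructor wraps an existing NEON register.

// Illustrative sketch only, not part of the original header.
void construct_example()
{
    v_int32x4 a(1, 2, 3, 4);        // element-wise constructor (vld1q_s32)
    v_int32x4 b(vdupq_n_s32(7));    // wrap a raw int32x4_t register
    int a0 = v_get0(a);             // == 1
    int b0 = v_get0(b);             // == 7
    (void)a0; (void)b0;
}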
380 
381 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
382 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
383 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
384 inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
385 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
386 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
387 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
388 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
389 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
390 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
391 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
392 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
393 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
394 
395 OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
396 OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
397 OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
398 OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
399 OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
400 OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
401 OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
402 OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
403 OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
404 #if CV_SIMD128_64F
405 #define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
406 inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
407 OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
408 OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
409 OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
410 OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
411 OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
412 OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
413 OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
414 OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
415 OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
416 OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
417 OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
418 #endif
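The OPENCV_HAL_IMPL_NEON_INIT macro above expands to v_setzero_*, v_setall_* and the full v_reinterpret_as_* family. A brief usage sketch:

// Illustrative sketch only, not part of the original header.
void init_example()
{
    v_float32x4 ones = v_setall_f32(1.0f);          // {1, 1, 1, 1}
    v_uint32x4  zero = v_setzero_u32();             // {0, 0, 0, 0}
    v_int32x4   bits = v_reinterpret_as_s32(ones);  // same 128 bits, viewed as int32 lanes
    (void)zero; (void)bits;
}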
419 
420 #define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
421 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
422 { \
423  hreg a1 = mov(a.val), b1 = mov(b.val); \
424  return _Tpvec(vcombine_##suffix(a1, b1)); \
425 } \
426 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
427 { \
428  hreg a1 = mov(a.val); \
429  vst1_##suffix(ptr, a1); \
430 } \
431 template<int n> inline \
432 _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
433 { \
434  hreg a1 = rshr(a.val, n); \
435  hreg b1 = rshr(b.val, n); \
436  return _Tpvec(vcombine_##suffix(a1, b1)); \
437 } \
438 template<int n> inline \
439 void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
440 { \
441  hreg a1 = rshr(a.val, n); \
442  vst1_##suffix(ptr, a1); \
443 }
444 
445 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
446 OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
447 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
448 OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
449 OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
450 OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)
451 
452 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
453 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
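What the pack family does, in scalar terms: v_pack narrows two wide registers into one with saturation, v_pack_u narrows signed lanes to unsigned with saturation, and v_rshr_pack<n> applies a rounding right shift by n before narrowing. A hedged sketch:

// Illustrative sketch only, not part of the original header.
void pack_example()
{
    v_int16x8 a(300, -200, 5, 6, 7, 8, 9, 10);
    v_int16x8 b(0, 1, 2, 3, 4, 5, 6, 7);
    v_int8x16  p  = v_pack(a, b);          // saturates: 300 -> 127, -200 -> -128
    v_uint8x16 pu = v_pack_u(a, b);        // saturates to [0, 255]: -200 -> 0, 300 -> 255
    v_int8x16  pr = v_rshr_pack<2>(a, b);  // rounding shift right by 2, then saturating narrow
    (void)p; (void)pu; (void)pr;
}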
454 
455 // pack boolean
456 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
457 {
458  uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
459  return v_uint8x16(ab);
460 }
461 
462 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
463  const v_uint32x4& c, const v_uint32x4& d)
464 {
465  uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
466  uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
467  return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
468 }
469 
470 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
471  const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
472  const v_uint64x2& g, const v_uint64x2& h)
473 {
474  uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
475  uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
476  uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
477  uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));
478 
479  uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
480  uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
481  return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
482 }
483 
484 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
485  const v_float32x4& m1, const v_float32x4& m2,
486  const v_float32x4& m3)
487 {
488  float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
489  float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
490  res = vmlaq_lane_f32(res, m1.val, vl, 1);
491  res = vmlaq_lane_f32(res, m2.val, vh, 0);
492  res = vmlaq_lane_f32(res, m3.val, vh, 1);
493  return v_float32x4(res);
494 }
495 
496 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
497  const v_float32x4& m1, const v_float32x4& m2,
498  const v_float32x4& a)
499 {
500  float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
501  float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
502  res = vmlaq_lane_f32(res, m1.val, vl, 1);
503  res = vmlaq_lane_f32(res, m2.val, vh, 0);
504  res = vaddq_f32(res, a.val);
505  return v_float32x4(res);
506 }
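In scalar terms, v_matmul treats m0..m3 as the columns of a 4x4 matrix and accumulates res = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3 with per-lane multiply-accumulates; v_matmuladd replaces the last column term with the bias vector a. A scalar reference of the same arithmetic (hypothetical helper, not part of the header):

// Illustrative scalar reference for v_matmul.
void matmul_ref(const float v[4], const float m0[4], const float m1[4],
                const float m2[4], const float m3[4], float res[4])
{
    for (int i = 0; i < 4; i++)
        res[i] = v[0]*m0[i] + v[1]*m1[i] + v[2]*m2[i] + v[3]*m3[i];
}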
507 
508 #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
509 inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \
510 { \
511  return _Tpvec(intrin(a.val, b.val)); \
512 }
513 
514 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8)
515 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8)
516 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8)
517 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8)
518 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16)
519 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16)
520 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16)
521 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16)
522 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32)
523 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32)
524 OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32)
525 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32)
526 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32)
527 OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32)
528 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32)
529 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32)
530 OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32)
531 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64)
532 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64)
533 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64)
534 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64)
535 #if CV_SIMD128_64F
536 OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32)
537 OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64)
538 OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64)
539 OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, vmulq_f64)
540 OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64)
541 #else
542 inline v_float32x4 v_div (const v_float32x4& a, const v_float32x4& b)
543 {
544  float32x4_t reciprocal = vrecpeq_f32(b.val);
545  reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
546  reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
547  return v_float32x4(vmulq_f32(a.val, reciprocal));
548 }
549 #endif
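On 32-bit ARM there is no vector divide, so the fallback above refines the rough reciprocal estimate from vrecpeq_f32 with two Newton-Raphson steps: vrecpsq_f32(b, r) computes 2 - b*r, so multiplying it back into r gives the standard iteration r' = r*(2 - b*r). A scalar view of one step (illustrative only):

// Scalar view of one refinement step used above, not part of the original header.
float refine_recip(float b, float r)
{
    return r * (2.0f - b * r);   // Newton-Raphson iteration toward 1/b
}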
550 
551 // saturating multiply 8-bit, 16-bit
552 #define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
553  inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b) \
554  { \
555  _Tpwvec c, d; \
556  v_mul_expand(a, b, c, d); \
557  return v_pack(c, d); \
558  }
559 
560 OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
561 OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
562 OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4)
563 OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)
564 
565 // Multiply and expand
566 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
567  v_int16x8& c, v_int16x8& d)
568 {
569  c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
570 #if CV_NEON_AARCH64
571  d.val = vmull_high_s8(a.val, b.val);
572 #else // #if CV_NEON_AARCH64
573  d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
574 #endif // #if CV_NEON_AARCH64
575 }
576 
577 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
578  v_uint16x8& c, v_uint16x8& d)
579 {
580  c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
581 #if CV_NEON_AARCH64
582  d.val = vmull_high_u8(a.val, b.val);
583 #else // #if CV_NEON_AARCH64
584  d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
585 #endif // #if CV_NEON_AARCH64
586 }
587 
588 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
589  v_int32x4& c, v_int32x4& d)
590 {
591  c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
592 #if CV_NEON_AARCH64
593  d.val = vmull_high_s16(a.val, b.val);
594 #else // #if CV_NEON_AARCH64
595  d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
596 #endif // #if CV_NEON_AARCH64
597 }
598 
599 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
600  v_uint32x4& c, v_uint32x4& d)
601 {
602  c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
603 #if CV_NEON_AARCH64
604  d.val = vmull_high_u16(a.val, b.val);
605 #else // #if CV_NEON_AARCH64
606  d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
607 #endif // #if CV_NEON_AARCH64
608 }
609 
610 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
611  v_uint64x2& c, v_uint64x2& d)
612 {
613  c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
614 #if CV_NEON_AARCH64
615  d.val = vmull_high_u32(a.val, b.val);
616 #else // #if CV_NEON_AARCH64
617  d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
618 #endif // #if CV_NEON_AARCH64
619 }
620 
621 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
622 {
623 #if CV_NEON_AARCH64
624  int32x4_t c = vmull_high_s16(a.val, b.val);
625 #else // #if CV_NEON_AARCH64
626  int32x4_t c = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
627 #endif // #if CV_NEON_AARCH64
628  return v_int16x8(vcombine_s16(
629  vshrn_n_s32(vmull_s16( vget_low_s16(a.val), vget_low_s16(b.val)), 16),
630  vshrn_n_s32(c, 16)
631  ));
632 }
633 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
634 {
635 #if CV_NEON_AARCH64
636  uint32x4_t c = vmull_high_u16(a.val, b.val);
637 #else // #if CV_NEON_AARCH64
638  uint32x4_t c = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
639 #endif // #if CV_NEON_AARCH64
640  return v_uint16x8(vcombine_u16(
641  vshrn_n_u32(vmull_u16( vget_low_u16(a.val), vget_low_u16(b.val)), 16),
642  vshrn_n_u32(c, 16)
643  ));
644 }
645 
647 
648 // 16 >> 32
649 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
650 {
651  int16x8_t uzp1, uzp2;
652  _v128_unzip(a.val, b.val, uzp1, uzp2);
653  int16x4_t a0 = vget_low_s16(uzp1);
654  int16x4_t b0 = vget_high_s16(uzp1);
655  int16x4_t a1 = vget_low_s16(uzp2);
656  int16x4_t b1 = vget_high_s16(uzp2);
657  int32x4_t p = vmull_s16(a0, b0);
658  return v_int32x4(vmlal_s16(p, a1, b1));
659 }
660 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
661 {
662  int16x8_t uzp1, uzp2;
663  _v128_unzip(a.val, b.val, uzp1, uzp2);
664  int16x4_t a0 = vget_low_s16(uzp1);
665  int16x4_t b0 = vget_high_s16(uzp1);
666  int16x4_t a1 = vget_low_s16(uzp2);
667  int16x4_t b1 = vget_high_s16(uzp2);
668  int32x4_t p = vmlal_s16(c.val, a0, b0);
669  return v_int32x4(vmlal_s16(p, a1, b1));
670 }
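Element-wise, v_dotprod multiplies adjacent pairs and adds them into one wider lane, optionally plus an accumulator: c[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]. A scalar reference sketch for the 16 >> 32 case (hypothetical helper, not part of the header):

// Illustrative scalar reference for v_dotprod(v_int16x8, v_int16x8).
void dotprod_ref(const short a[8], const short b[8], int c[4])
{
    for (int i = 0; i < 4; i++)
        c[i] = (int)a[2*i] * b[2*i] + (int)a[2*i + 1] * b[2*i + 1];
}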
671 
672 // 32 >> 64
673 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
674 {
675  int32x4_t uzp1, uzp2;
676  _v128_unzip(a.val, b.val, uzp1, uzp2);
677  int32x2_t a0 = vget_low_s32(uzp1);
678  int32x2_t b0 = vget_high_s32(uzp1);
679  int32x2_t a1 = vget_low_s32(uzp2);
680  int32x2_t b1 = vget_high_s32(uzp2);
681  int64x2_t p = vmull_s32(a0, b0);
682  return v_int64x2(vmlal_s32(p, a1, b1));
683 }
684 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
685 {
686  int32x4_t uzp1, uzp2;
687  _v128_unzip(a.val, b.val, uzp1, uzp2);
688  int32x2_t a0 = vget_low_s32(uzp1);
689  int32x2_t b0 = vget_high_s32(uzp1);
690  int32x2_t a1 = vget_low_s32(uzp2);
691  int32x2_t b1 = vget_high_s32(uzp2);
692  int64x2_t p = vmlal_s32(c.val, a0, b0);
693  return v_int64x2(vmlal_s32(p, a1, b1));
694 }
695 
696 // 8 >> 32
697 #ifdef CV_NEON_DOT
698 #define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \
699 inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b) \
700 { \
701  return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\
702 } \
703 inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
704 { \
705  return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \
706 }
707 
708 OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_uint32x4, v_uint8x16, u32)
709 OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_int32x4, v_int8x16, s32)
710 #else
711 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
712 {
713  const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0));
714  const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
715  const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
716  const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
717 
718  uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
719  vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
720  uint16x8_t odd = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
721  vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));
722 
723  uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
724  vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
725  uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
726  vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
727  return v_uint32x4(vaddq_u32(s0, s1));
728 }
729 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
730  const v_uint32x4& c)
731 {
732  return v_add(v_dotprod_expand(a, b), c);
733 }
734 
735 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
736 {
737  int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
738  int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
739  int16x8_t uzp1, uzp2;
740  _v128_unzip(p0, p1, uzp1, uzp2);
741  int16x8_t sum = vaddq_s16(uzp1, uzp2);
742  int16x4_t uzpl1, uzpl2;
743  _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
744  return v_int32x4(vaddl_s16(uzpl1, uzpl2));
745 }
746 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
747  const v_int32x4& c)
748 {
749  return v_add(v_dotprod_expand(a, b), c);
750 }
751 #endif
752 // 16 >> 64
753 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
754 {
755  const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
756  const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
757 
758  uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
759  vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
760  uint32x4_t odd = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
761  vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
762  uint32x4_t uzp1, uzp2;
763  _v128_unzip(even, odd, uzp1, uzp2);
764  uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
765  uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
766  return v_uint64x2(vaddq_u64(s0, s1));
767 }
768 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
769 { return v_add(v_dotprod_expand(a, b), c); }
770 
771 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
772 {
773  int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
774  int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
775 
776  int32x4_t uzp1, uzp2;
777  _v128_unzip(p0, p1, uzp1, uzp2);
778  int32x4_t sum = vaddq_s32(uzp1, uzp2);
779 
780  int32x2_t uzpl1, uzpl2;
781  _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
782  return v_int64x2(vaddl_s32(uzpl1, uzpl2));
783 }
784 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
785  const v_int64x2& c)
786 { return v_add(v_dotprod_expand(a, b), c); }
787 
788 // 32 >> 64f
789 #if CV_SIMD128_64F
790 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
791 { return v_cvt_f64(v_dotprod(a, b)); }
792 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
793  const v_float64x2& c)
794 { return v_add(v_dotprod_expand(a, b), c); }
795 #endif
796 
798 
799 // 16 >> 32
800 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
801 {
802 #if CV_NEON_AARCH64
803  int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
804  return v_int32x4(vmlal_high_s16(p, a.val, b.val));
805 #else
806  int16x4_t a0 = vget_low_s16(a.val);
807  int16x4_t a1 = vget_high_s16(a.val);
808  int16x4_t b0 = vget_low_s16(b.val);
809  int16x4_t b1 = vget_high_s16(b.val);
810  int32x4_t p = vmull_s16(a0, b0);
811  return v_int32x4(vmlal_s16(p, a1, b1));
812 #endif
813 }
814 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
815 {
816 #if CV_NEON_AARCH64
817  int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
818  return v_int32x4(vmlal_high_s16(p, a.val, b.val));
819 #else
820  int16x4_t a0 = vget_low_s16(a.val);
821  int16x4_t a1 = vget_high_s16(a.val);
822  int16x4_t b0 = vget_low_s16(b.val);
823  int16x4_t b1 = vget_high_s16(b.val);
824  int32x4_t p = vmlal_s16(c.val, a0, b0);
825  return v_int32x4(vmlal_s16(p, a1, b1));
826 #endif
827 }
828 
829 // 32 >> 64
830 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
831 {
832 #if CV_NEON_AARCH64
833  int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
834  return v_int64x2(vmlal_high_s32(p, a.val, b.val));
835 #else
836  int32x2_t a0 = vget_low_s32(a.val);
837  int32x2_t a1 = vget_high_s32(a.val);
838  int32x2_t b0 = vget_low_s32(b.val);
839  int32x2_t b1 = vget_high_s32(b.val);
840  int64x2_t p = vmull_s32(a0, b0);
841  return v_int64x2(vmlal_s32(p, a1, b1));
842 #endif
843 }
844 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
845 {
846 #if CV_NEON_AARCH64
847  int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
848  return v_int64x2(vmlal_high_s32(p, a.val, b.val));
849 #else
850  int32x2_t a0 = vget_low_s32(a.val);
851  int32x2_t a1 = vget_high_s32(a.val);
852  int32x2_t b0 = vget_low_s32(b.val);
853  int32x2_t b1 = vget_high_s32(b.val);
854  int64x2_t p = vmlal_s32(c.val, a0, b0);
855  return v_int64x2(vmlal_s32(p, a1, b1));
856 #endif
857 }
858 
859 // 8 >> 32
860 #ifdef CV_NEON_DOT
861 #define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \
862 inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b) \
863 { \
864  return v_dotprod_expand(a, b); \
865 } \
866 inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
867 { \
868  return v_dotprod_expand(a, b, c); \
869 }
870 
871 OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_uint32x4, v_uint8x16, u32)
872 OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_int32x4, v_int8x16, s32)
873 #else
874 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
875 {
876  uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
877  uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
878  uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
879  uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
880  return v_uint32x4(vaddq_u32(s0, s1));
881 }
882 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
883 {
884  return v_add(v_dotprod_expand_fast(a, b), c);
885 }
886 
887 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
888 {
889  int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
890  prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
891  return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
892 }
893 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
894 {
895  return v_add(v_dotprod_expand_fast(a, b), c);
896 }
897 #endif
898 
899 // 16 >> 64
900 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
901 {
902  uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
903  uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
904  uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
905  uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
906  return v_uint64x2(vaddq_u64(s0, s1));
907 }
908 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
909 { return v_add(v_dotprod_expand_fast(a, b), c); }
910 
911 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
912 {
913  int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
914  prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
915  return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
916 }
917 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
918 { return v_add(v_dotprod_expand_fast(a, b), c); }
919 
920 // 32 >> 64f
921 #if CV_SIMD128_64F
922 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
923 { return v_cvt_f64(v_dotprod_fast(a, b)); }
924 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
925 { return v_add(v_dotprod_expand_fast(a, b), c); }
926 #endif
927 
928 
929 #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
930  OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \
931  OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \
932  OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \
933  inline _Tpvec v_not (const _Tpvec& a) \
934  { \
935  return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
936  }
937 
938 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
939 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
940 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
941 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
942 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
943 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
944 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
945 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
946 
947 #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
948 inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \
949 { \
950  return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
951 }
952 
953 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32)
954 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32)
955 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32)
956 
957 inline v_float32x4 v_not (const v_float32x4& a)
958 {
959  return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
960 }
961 
962 #if CV_SIMD128_64F
963 inline v_float32x4 v_sqrt(const v_float32x4& x)
964 {
965  return v_float32x4(vsqrtq_f32(x.val));
966 }
967 
968 inline v_float32x4 v_invsqrt(const v_float32x4& x)
969 {
970  v_float32x4 one = v_setall_f32(1.0f);
971  return v_div(one, v_sqrt(x));
972 }
973 #else
974 inline v_float32x4 v_sqrt(const v_float32x4& x)
975 {
976  float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
977  float32x4_t e = vrsqrteq_f32(x1);
978  e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
979  e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
980  return v_float32x4(vmulq_f32(x.val, e));
981 }
982 
983 inline v_float32x4 v_invsqrt(const v_float32x4& x)
984 {
985  float32x4_t e = vrsqrteq_f32(x.val);
986  e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
987  e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
988  return v_float32x4(e);
989 }
990 #endif
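The fallback v_sqrt/v_invsqrt above follow the same scheme as the division fallback: vrsqrteq_f32 gives a rough estimate of 1/sqrt(x), and vrsqrtsq_f32(x*e, e) computes (3 - x*e*e)/2, so multiplying it back into e is one Newton-Raphson step for the reciprocal square root; v_sqrt then multiplies by x (after clamping to FLT_MIN to avoid 0*inf). Scalar view of one step (illustrative only):

// Scalar view of one refinement step used above, not part of the original header.
float refine_rsqrt(float x, float e)
{
    return e * (3.0f - x * e * e) * 0.5f;   // Newton-Raphson iteration toward 1/sqrt(x)
}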
991 
992 #define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
993 inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
994 
995 OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
996 OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
997 OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
998 
999 inline v_float32x4 v_abs(v_float32x4 x)
1000 { return v_float32x4(vabsq_f32(x.val)); }
1001 
1002 #if CV_SIMD128_64F
1003 #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
1004 inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \
1005 { \
1006  return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
1007 }
1008 
1009 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64)
1010 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64)
1011 OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64)
1012 
1013 inline v_float64x2 v_not (const v_float64x2& a)
1014 {
1015  return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
1016 }
1017 
1018 inline v_float64x2 v_sqrt(const v_float64x2& x)
1019 {
1020  return v_float64x2(vsqrtq_f64(x.val));
1021 }
1022 
1023 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1024 {
1025  v_float64x2 one = v_setall_f64(1.0f);
1026  return v_div(one, v_sqrt(x));
1027 }
1028 
1029 inline v_float64x2 v_abs(v_float64x2 x)
1030 { return v_float64x2(vabsq_f64(x.val)); }
1031 #endif
1032 
1033 // TODO: exp, log, sin, cos
1034 
1035 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
1036 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1037 { \
1038  return _Tpvec(intrin(a.val, b.val)); \
1039 }
1040 
1041 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
1042 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
1043 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
1044 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
1045 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
1046 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
1047 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
1048 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
1049 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
1050 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
1051 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
1052 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
1053 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
1054 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
1055 #if CV_SIMD128_64F
1056 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
1057 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
1058 #endif
1059 
1060 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
1061 inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
1062 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
1063 inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
1064 { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
1065 inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \
1066 { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
1067 inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \
1068 { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
1069 inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \
1070 { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
1071 inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \
1072 { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
1073 
1074 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
1075 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
1076 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
1077 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
1078 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
1079 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
1080 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
1081 #if defined(__aarch64__) || defined(_M_ARM64)
1082 static inline uint64x2_t vmvnq_u64(uint64x2_t a)
1083 {
1084  uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
1085  return veorq_u64(a, vx);
1086 }
1087 //OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
1088 //OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
1089 static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
1090 { return v_uint64x2(vceqq_u64(a.val, b.val)); }
1091 static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
1092 { return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); }
1093 static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
1094 { return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); }
1095 static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
1096 { return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); }
1097 #else
1098 static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
1099 {
1100  uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
1101  uint32x4_t swapped = vrev64q_u32(cmp);
1102  return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped)));
1103 }
1104 static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
1105 {
1106  uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
1107  uint32x4_t swapped = vrev64q_u32(cmp);
1108  uint64x2_t v_eq = vreinterpretq_u64_u32(vandq_u32(cmp, swapped));
1109  uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
1110  return v_uint64x2(veorq_u64(v_eq, vx));
1111 }
1112 static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
1113 {
1114  return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
1115 }
1116 static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
1117 {
1118  return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
1119 }
1120 #endif
1121 #if CV_SIMD128_64F
1122 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
1123 #endif
1124 
1125 inline v_float32x4 v_not_nan(const v_float32x4& a)
1126 { return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
1127 #if CV_SIMD128_64F
1128 inline v_float64x2 v_not_nan(const v_float64x2& a)
1129 { return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
1130 #endif
1131 
1132 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
1133 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
1134 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
1135 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
1136 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
1137 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
1138 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
1139 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
1140 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
1141 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
1142 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
1143 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
1144 
1145 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
1146 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
1147 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
1148 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
1149 #if CV_SIMD128_64F
1150 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
1151 #endif
1152 
1154 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1155 { return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
1156 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1157 { return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }
1158 
1159 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
1160 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
1161 { \
1162  return _Tpvec2(cast(intrin(a.val, b.val))); \
1163 }
1164 
1165 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
1166 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
1167 OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
1168 
1169 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
1170 {
1171  v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
1172  return v_sqrt(x);
1173 }
1174 
1175 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
1176 {
1177  return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
1178 }
1179 
1180 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1181 {
1182 #if CV_SIMD128_64F
1183  // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
1184  // also adds FMA support both for single- and double-precision floating-point vectors
1185  return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
1186 #else
1187  return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
1188 #endif
1189 }
1190 
1191 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1192 {
1193  return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
1194 }
1195 
1196 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1197 {
1198  return v_fma(a, b, c);
1199 }
1200 
1201 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1202 {
1203  return v_fma(a, b, c);
1204 }
1205 
1206 #if CV_SIMD128_64F
1207 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
1208 {
1209  v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
1210  return v_sqrt(x);
1211 }
1212 
1213 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
1214 {
1215  return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
1216 }
1217 
1218 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1219 {
1220  return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
1221 }
1222 
1223 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1224 {
1225  return v_fma(a, b, c);
1226 }
1227 #endif
1228 
1229 // trade efficiency for convenience
1230 #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
1231 inline _Tpvec v_shl (const _Tpvec& a, int n) \
1232 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
1233 inline _Tpvec v_shr (const _Tpvec& a, int n) \
1234 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
1235 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1236 { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
1237 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1238 { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
1239 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1240 { return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
1241 
1242 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
1243 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
1244 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
1245 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
1246 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
1247 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
1248 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
1249 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
1250 
1251 #define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
1252 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1253 { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
1254 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1255 { return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \
1256 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1257 { return a; } \
1258 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1259 { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
1260 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1261 { return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \
1262 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1263 { CV_UNUSED(b); return a; }
1264 
1265 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
1266 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
1267 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
1268 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
1269 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
1270 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
1271 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
1272 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
1273 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
1274 #if CV_SIMD128_64F
1275 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
1276 #endif
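Usage sketch for the shift and rotate helpers defined above: v_shl/v_shr shift the bits within each lane, while v_rotate_right<n> shifts whole lanes out of the register, filling with zeros (or with lanes from a second register in the two-argument form).

// Illustrative sketch only, not part of the original header.
void shift_rotate_example()
{
    v_uint32x4 a(1, 2, 3, 4);
    v_uint32x4 s = v_shl<2>(a);           // {4, 8, 12, 16}: per-lane shift left by 2
    v_uint32x4 r = v_rotate_right<1>(a);  // {2, 3, 4, 0}:   drop lane 0, zero-fill the top
    (void)s; (void)r;
}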
1277 
1278 #if defined(__clang__) && defined(__aarch64__)
1279 // avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
1280 #define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
1281 inline _Tpvec v_load_low(const _Tp* ptr) \
1282 { \
1283 typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
1284 uint64 v = *(unaligned_uint64*)ptr; \
1285 return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
1286 }
1287 #else
1288 #define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
1289 inline _Tpvec v_load_low(const _Tp* ptr) \
1290 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
1291 #endif
1292 
1293 #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
1294 inline _Tpvec v_load(const _Tp* ptr) \
1295 { return _Tpvec(vld1q_##suffix(ptr)); } \
1296 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1297 { return _Tpvec(vld1q_##suffix(ptr)); } \
1298 OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
1299 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1300 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
1301 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1302 { vst1q_##suffix(ptr, a.val); } \
1303 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1304 { vst1q_##suffix(ptr, a.val); } \
1305 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1306 { vst1q_##suffix(ptr, a.val); } \
1307 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1308 { vst1q_##suffix(ptr, a.val); } \
1309 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1310 { vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
1311 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1312 { vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
1313 
1314 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
1315 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
1316 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
1317 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
1318 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
1319 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
1320 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
1321 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
1322 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
1323 #if CV_SIMD128_64F
1324 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
1325 #endif
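Usage sketch for the load/store group: v_load reads 16 bytes from memory with no alignment requirement, v_store writes them back, and v_load_low/v_store_low/v_store_high handle the 64-bit halves.

// Illustrative sketch only, not part of the original header.
void load_store_example(const float* src, float* dst)
{
    v_float32x4 v = v_load(src);         // 4 floats, unaligned load
    v = v_add(v, v_setall_f32(1.0f));    // element-wise add defined earlier in this header
    v_store(dst, v);                     // write 4 floats back
    v_store_low(dst + 4, v);             // store only the low 2 lanes
}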
1326 
1327 inline unsigned v_reduce_sum(const v_uint8x16& a)
1328 {
1329 #if CV_NEON_AARCH64
1330  uint16_t t0 = vaddlvq_u8(a.val);
1331  return t0;
1332 #else // #if CV_NEON_AARCH64
1333  uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
1334  uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1335  return vget_lane_u32(vpadd_u32(t1, t1), 0);
1336 #endif // #if CV_NEON_AARCH64
1337 }
1338 inline int v_reduce_sum(const v_int8x16& a)
1339 {
1340 #if CV_NEON_AARCH64
1341  int16_t t0 = vaddlvq_s8(a.val);
1342  return t0;
1343 #else // #if CV_NEON_AARCH64
1344  int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
1345  int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
1346  return vget_lane_s32(vpadd_s32(t1, t1), 0);
1347 #endif // #if CV_NEON_AARCH64
1348 }
1349 inline unsigned v_reduce_sum(const v_uint16x8& a)
1350 {
1351 #if CV_NEON_AARCH64
1352  uint32_t t0 = vaddlvq_u16(a.val);
1353  return t0;
1354 #else // #if CV_NEON_AARCH64
1355  uint32x4_t t0 = vpaddlq_u16(a.val);
1356  uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1357  return vget_lane_u32(vpadd_u32(t1, t1), 0);
1358 #endif // #if CV_NEON_AARCH64
1359 }
1360 inline int v_reduce_sum(const v_int16x8& a)
1361 {
1362 #if CV_NEON_AARCH64
1363  int32_t t0 = vaddlvq_s16(a.val);
1364  return t0;
1365 #else // #if CV_NEON_AARCH64
1366  int32x4_t t0 = vpaddlq_s16(a.val);
1367  int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
1368  return vget_lane_s32(vpadd_s32(t1, t1), 0);
1369 #endif // #if CV_NEON_AARCH64
1370 }
1371 
1372 #if CV_NEON_AARCH64
1373 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1374 inline scalartype v_reduce_##func(const _Tpvec& a) \
1375 { \
1376  return v##vectorfunc##vq_##suffix(a.val); \
1377 }
1378 #else // #if CV_NEON_AARCH64
1379 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1380 inline scalartype v_reduce_##func(const _Tpvec& a) \
1381 { \
1382  _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1383  a0 = vp##vectorfunc##_##suffix(a0, a0); \
1384  a0 = vp##vectorfunc##_##suffix(a0, a0); \
1385  return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
1386 }
1387 #endif // #if CV_NEON_AARCH64
1388 
1389 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
1390 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
1391 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
1392 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
1393 
1394 #if CV_NEON_AARCH64
1395 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1396 inline scalartype v_reduce_##func(const _Tpvec& a) \
1397 { \
1398  return v##vectorfunc##vq_##suffix(a.val); \
1399 }
1400 #else // #if CV_NEON_AARCH64
1401 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1402 inline scalartype v_reduce_##func(const _Tpvec& a) \
1403 { \
1404  _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1405  a0 = vp##vectorfunc##_##suffix(a0, a0); \
1406  return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
1407 }
1408 #endif // #if CV_NEON_AARCH64
1409 
1410 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
1411 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
1412 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
1413 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
1414 
1415 #if CV_NEON_AARCH64
1416 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1417 inline scalartype v_reduce_##func(const _Tpvec& a) \
1418 { \
1419  return v##vectorfunc##vq_##suffix(a.val); \
1420 }
1421 #else // #if CV_NEON_AARCH64
1422 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1423 inline scalartype v_reduce_##func(const _Tpvec& a) \
1424 { \
1425  _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1426  return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
1427 }
1428 #endif // #if CV_NEON_AARCH64
1429 
1430 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
1431 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
1432 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
1433 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
1434 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
1435 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
1436 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
1437 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
1438 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
1439 
1440 inline uint64 v_reduce_sum(const v_uint64x2& a)
1441 {
1442 #if CV_NEON_AARCH64
1443  return vaddvq_u64(a.val);
1444 #else // #if CV_NEON_AARCH64
1445  return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);
1446 #endif // #if CV_NEON_AARCH64
1447 }
1448 inline int64 v_reduce_sum(const v_int64x2& a)
1449 {
1450 #if CV_NEON_AARCH64
1451  return vaddvq_s64(a.val);
1452 #else // #if CV_NEON_AARCH64
1453  return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);
1454 #endif // #if CV_NEON_AARCH64
1455 }
1456 #if CV_SIMD128_64F
1457 inline double v_reduce_sum(const v_float64x2& a)
1458 {
1459  return vaddvq_f64(a.val);
1460 }
1461 #endif
1462 
1463 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1464  const v_float32x4& c, const v_float32x4& d)
1465 {
1466 #if CV_NEON_AARCH64
1467  float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
1468  float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
1469  return v_float32x4(vpaddq_f32(ab, cd)); // sumA sumB sumC sumD
1470 #else // #if CV_NEON_AARCH64
1471  float32x4x2_t ab = vtrnq_f32(a.val, b.val);
1472  float32x4x2_t cd = vtrnq_f32(c.val, d.val);
1473 
1474  float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
1475  float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
1476 
1477  float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
1478  float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
1479 
1480  return v_float32x4(vaddq_f32(v0, v1));
1481 #endif // #if CV_NEON_AARCH64
1482 }
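// Worked example (illustrative): with a = {1,1,1,1}, b = {2,2,2,2},
// c = {3,3,3,3} and d = {4,4,4,4}, v_reduce_sum4 returns {4, 8, 12, 16},
// i.e. one horizontal sum per input vector.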
1483 
1484 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1485 {
1486 #if CV_NEON_AARCH64
1487  uint8x16_t t0 = vabdq_u8(a.val, b.val);
1488  uint16_t t1 = vaddlvq_u8(t0);
1489  return t1;
1490 #else // #if CV_NEON_AARCH64
1491  uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
1492  uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1493  return vget_lane_u32(vpadd_u32(t1, t1), 0);
1494 #endif // #if CV_NEON_AARCH64
1495 }
1496 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1497 {
1498 #if CV_NEON_AARCH64
1499  uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
1500  uint16_t t1 = vaddlvq_u8(t0);
1501  return t1;
1502 #else // #if CV_NEON_AARCH64
1503  uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
1504  uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1505  return vget_lane_u32(vpadd_u32(t1, t1), 0);
1506 #endif // #if CV_NEON_AARCH64
1507 }
1508 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1509 {
1510 #if CV_NEON_AARCH64
1511  uint16x8_t t0 = vabdq_u16(a.val, b.val);
1512  uint32_t t1 = vaddlvq_u16(t0);
1513  return t1;
1514 #else // #if CV_NEON_AARCH64
1515  uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
1516  uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1517  return vget_lane_u32(vpadd_u32(t1, t1), 0);
1518 #endif // #if CV_NEON_AARCH64
1519 }
1520 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1521 {
1522 #if CV_NEON_AARCH64
1523  uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
1524  uint32_t t1 = vaddlvq_u16(t0);
1525  return t1;
1526 #else // #if CV_NEON_AARCH64
1527  uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
1528  uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1529  return vget_lane_u32(vpadd_u32(t1, t1), 0);
1530 #endif // #if CV_NEON_AARCH64
1531 }
1532 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1533 {
1534 #if CV_NEON_AARCH64
1535  uint32x4_t t0 = vabdq_u32(a.val, b.val);
1536  uint32_t t1 = vaddvq_u32(t0);
1537  return t1;
1538 #else // #if CV_NEON_AARCH64
1539  uint32x4_t t0 = vabdq_u32(a.val, b.val);
1540  uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1541  return vget_lane_u32(vpadd_u32(t1, t1), 0);
1542 #endif // #if CV_NEON_AARCH64
1543 }
1544 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1545 {
1546 #if CV_NEON_AARCH64
1547  uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
1548  uint32_t t1 = vaddvq_u32(t0);
1549  return t1;
1550 #else // #if CV_NEON_AARCH64
1551  uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
1552  uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1553  return vget_lane_u32(vpadd_u32(t1, t1), 0);
1554 #endif // #if CV_NEON_AARCH64
1555 }
1556 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1557 {
1558 #if CV_NEON_AARCH64
1559  float32x4_t t0 = vabdq_f32(a.val, b.val);
1560  return vaddvq_f32(t0);
1561 #else // #if CV_NEON_AARCH64
1562  float32x4_t t0 = vabdq_f32(a.val, b.val);
1563  float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
1564  return vget_lane_f32(vpadd_f32(t1, t1), 0);
1565 #endif // #if CV_NEON_AARCH64
1566 }
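// The SAD reductions compute |a[i] - b[i]| with vabdq_* and then reuse the
// same horizontal-sum pattern as v_reduce_sum; signed inputs reinterpret the
// (non-negative) absolute differences as unsigned before widening.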
1567 
1568 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1569 { return v_uint8x16(vcntq_u8(a.val)); }
1570 inline v_uint8x16 v_popcount(const v_int8x16& a)
1571 { return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
1572 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1573 { return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
1574 inline v_uint16x8 v_popcount(const v_int16x8& a)
1575 { return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
1576 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1577 { return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
1578 inline v_uint32x4 v_popcount(const v_int32x4& a)
1579 { return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
1580 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1581 { return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
1582 inline v_uint64x2 v_popcount(const v_int64x2& a)
1583 { return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
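// v_popcount counts set bits per byte with vcntq_u8 and widens the per-byte
// counts up to the lane size with pairwise widening adds. For example, a
// uint32 lane holding 0x0F0F0F0F produces a count of 16.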
1584 
1585 inline int v_signmask(const v_uint8x16& a)
1586 {
1587 #if CV_NEON_AARCH64
1588  const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
1589  const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
1590  uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
1591  uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
1592  uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));
1593  return t0;
1594 #else // #if CV_NEON_AARCH64
1595  int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
1596  uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
1597  uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
1598  return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
1599 #endif // #if CV_NEON_AARCH64
1600 }
1601 
1602 inline int v_signmask(const v_int8x16& a)
1603 { return v_signmask(v_reinterpret_as_u8(a)); }
1604 
1605 inline int v_signmask(const v_uint16x8& a)
1606 {
1607 #if CV_NEON_AARCH64
1608  const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
1609  uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
1610  uint32_t t0 = vaddlvq_u16(v0);
1611  return t0;
1612 #else // #if CV_NEON_AARCH64
1613  int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
1614  uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
1615  uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
1616  return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
1617 #endif // #if CV_NEON_AARCH64
1618 }
1619 inline int v_signmask(const v_int16x8& a)
1620 { return v_signmask(v_reinterpret_as_u16(a)); }
1621 
1622 inline int v_signmask(const v_uint32x4& a)
1623 {
1624 #if CV_NEON_AARCH64
1625  const int32x4_t signPosition = {0,1,2,3};
1626  uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
1627  uint32_t t0 = vaddvq_u32(v0);
1628  return t0;
1629 #else // #if CV_NEON_AARCH64
1630  int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
1631  uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
1632  uint64x2_t v1 = vpaddlq_u32(v0);
1633  return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
1634 #endif // #if CV_NEON_AARCH64
1635 }
1636 inline int v_signmask(const v_int32x4& a)
1637 { return v_signmask(v_reinterpret_as_u32(a)); }
1638 inline int v_signmask(const v_float32x4& a)
1639 { return v_signmask(v_reinterpret_as_u32(a)); }
1640 inline int v_signmask(const v_uint64x2& a)
1641 {
1642 #if CV_NEON_AARCH64
1643  const int64x2_t signPosition = {0,1};
1644  uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
1645  int t0 = (int)vaddvq_u64(v0);
1646  return t0;
1647 #else // #if CV_NEON_AARCH64
1648  int64x1_t m0 = vdup_n_s64(0);
1649  uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
1650  return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
1651 #endif // #if CV_NEON_AARCH64
1652 }
1653 inline int v_signmask(const v_int64x2& a)
1654 { return v_signmask(v_reinterpret_as_u64(a)); }
1655 #if CV_SIMD128_64F
1656 inline int v_signmask(const v_float64x2& a)
1657 { return v_signmask(v_reinterpret_as_u64(a)); }
1658 #endif
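// v_signmask packs the most significant bit of each lane into an integer
// bitmask, with lane 0 in bit 0. For example, a v_uint32x4 mask whose lanes
// are {all-ones, 0, all-ones, 0} yields 0b0101 == 5.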
1659 
1660 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
1661 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
1662 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
1663 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
1664 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
1665 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
1666 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
1667 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
1668 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1669 #if CV_SIMD128_64F
1670 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
1671 #endif
1672 
1673 #if CV_NEON_AARCH64
1674  #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1675  inline bool v_check_all(const v_##_Tpvec& a) \
1676  { \
1677  return (vminvq_##suffix(a.val) >> shift) != 0; \
1678  } \
1679  inline bool v_check_any(const v_##_Tpvec& a) \
1680  { \
1681  return (vmaxvq_##suffix(a.val) >> shift) != 0; \
1682  }
1683 #else // #if CV_NEON_AARCH64
1684  #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1685  inline bool v_check_all(const v_##_Tpvec& a) \
1686  { \
1687  _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
1688  uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1689  return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
1690  } \
1691  inline bool v_check_any(const v_##_Tpvec& a) \
1692  { \
1693  _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
1694  uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1695  return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
1696  }
1697 #endif // #if CV_NEON_AARCH64
1698 
1699 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
1700 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
1701 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
1702 
1703 inline bool v_check_all(const v_uint64x2& a)
1704 {
1705  uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1706  return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;
1707 }
1708 inline bool v_check_any(const v_uint64x2& a)
1709 {
1710  uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1711  return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
1712 }
1713 
1714 inline bool v_check_all(const v_int8x16& a)
1715 { return v_check_all(v_reinterpret_as_u8(a)); }
1716 inline bool v_check_all(const v_int16x8& a)
1717 { return v_check_all(v_reinterpret_as_u16(a)); }
1718 inline bool v_check_all(const v_int32x4& a)
1719 { return v_check_all(v_reinterpret_as_u32(a)); }
1720 inline bool v_check_all(const v_float32x4& a)
1721 { return v_check_all(v_reinterpret_as_u32(a)); }
1722 
1723 inline bool v_check_any(const v_int8x16& a)
1724 { return v_check_any(v_reinterpret_as_u8(a)); }
1725 inline bool v_check_any(const v_int16x8& a)
1726 { return v_check_any(v_reinterpret_as_u16(a)); }
1727 inline bool v_check_any(const v_int32x4& a)
1728 { return v_check_any(v_reinterpret_as_u32(a)); }
1729 inline bool v_check_any(const v_float32x4& a)
1730 { return v_check_any(v_reinterpret_as_u32(a)); }
1731 
1732 inline bool v_check_all(const v_int64x2& a)
1733 { return v_check_all(v_reinterpret_as_u64(a)); }
1734 inline bool v_check_any(const v_int64x2& a)
1735 { return v_check_any(v_reinterpret_as_u64(a)); }
1736 #if CV_SIMD128_64F
1737 inline bool v_check_all(const v_float64x2& a)
1738 { return v_check_all(v_reinterpret_as_u64(a)); }
1739 inline bool v_check_any(const v_float64x2& a)
1740 { return v_check_any(v_reinterpret_as_u64(a)); }
1741 #endif
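// v_check_all / v_check_any inspect the lane sign bits: on AArch64 a single
// across-vector min/max (vminvq_* / vmaxvq_*) suffices, while the 32-bit ARM
// path shifts the mask (complemented for v_check_all) down to its sign bits
// and ORs the two 64-bit halves.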
1742 
1743 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
1744 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1745 { \
1746  return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
1747 }
1748 
1749 OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
1750 OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
1751 OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
1752 OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
1753 OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
1754 OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
1755 OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
1756 #if CV_SIMD128_64F
1757 OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
1758 #endif
1759 
1760 #if CV_NEON_AARCH64
1761 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1762 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1763 { \
1764  b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1765  b1.val = vmovl_high_##suffix(a.val); \
1766 } \
1767 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1768 { \
1769  return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
1770 } \
1771 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1772 { \
1773  return _Tpwvec(vmovl_high_##suffix(a.val)); \
1774 } \
1775 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1776 { \
1777  return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1778 }
1779 #else
1780 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1781 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1782 { \
1783  b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1784  b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
1785 } \
1786 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1787 { \
1788  return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
1789 } \
1790 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1791 { \
1792  return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
1793 } \
1794 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1795 { \
1796  return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1797 }
1798 #endif
1799 
1800 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
1801 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
1802 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
1803 OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
1804 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
1805 OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
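// v_expand widens each half of a vector to the next lane width, e.g. a
// v_uint8x16 expands into two v_uint16x8. The AArch64 variant uses
// vmovl_high_* to avoid the explicit vget_high_* of the generic path.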
1806 
1807 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1808 {
1809  typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1810  uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
1811  uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
1812  return v_uint32x4(vmovl_u16(v1));
1813 }
1814 
1815 inline v_int32x4 v_load_expand_q(const schar* ptr)
1816 {
1817  typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1818  int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
1819  int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
1820  return v_int32x4(vmovl_s16(v1));
1821 }
1822 
1823 #if defined(__aarch64__) || defined(_M_ARM64)
1824 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1825 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1826 { \
1827  b0.val = vzip1q_##suffix(a0.val, a1.val); \
1828  b1.val = vzip2q_##suffix(a0.val, a1.val); \
1829 } \
1830 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1831 { \
1832  return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1833 } \
1834 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1835 { \
1836  return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1837 } \
1838 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1839 { \
1840  c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1841  d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1842 }
1843 #else
1844 #define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1845 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1846 { \
1847  _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
1848  b0.val = p.val[0]; \
1849  b1.val = p.val[1]; \
1850 } \
1851 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1852 { \
1853  return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1854 } \
1855 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1856 { \
1857  return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1858 } \
1859 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1860 { \
1861  c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1862  d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1863 }
1864 #endif
1865 
1866 OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
1867 OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
1868 OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
1869 OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
1870 OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
1871 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
1872 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
1873 #if CV_SIMD128_64F
1874 OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
1875 #endif
1876 
1877 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1878 {
1879  uint8x16_t vec = vrev64q_u8(a.val);
1880  return v_uint8x16(vextq_u8(vec, vec, 8));
1881 }
1882 
1883 inline v_int8x16 v_reverse(const v_int8x16 &a)
1884 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1885 
1886 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1887 {
1888  uint16x8_t vec = vrev64q_u16(a.val);
1889  return v_uint16x8(vextq_u16(vec, vec, 4));
1890 }
1891 
1892 inline v_int16x8 v_reverse(const v_int16x8 &a)
1893 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1894 
1895 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1896 {
1897  uint32x4_t vec = vrev64q_u32(a.val);
1898  return v_uint32x4(vextq_u32(vec, vec, 2));
1899 }
1900 
1901 inline v_int32x4 v_reverse(const v_int32x4 &a)
1902 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1903 
1904 inline v_float32x4 v_reverse(const v_float32x4 &a)
1905 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1906 
1907 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1908 {
1909  uint64x2_t vec = a.val;
1910  uint64x1_t vec_lo = vget_low_u64(vec);
1911  uint64x1_t vec_hi = vget_high_u64(vec);
1912  return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
1913 }
1914 
1915 inline v_int64x2 v_reverse(const v_int64x2 &a)
1916 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1917 
1918 #if CV_SIMD128_64F
1919 inline v_float64x2 v_reverse(const v_float64x2 &a)
1920 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1921 #endif
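// v_reverse flips the lane order (e.g. {1,2,3,4} becomes {4,3,2,1}):
// vrev64q_* reverses lanes within each 64-bit half and vextq_* swaps the
// halves; 64-bit lane types only need the half swap.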
1922 
1923 #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
1924 template <int s> \
1925 inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1926 { \
1927  return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
1928 }
1929 
1930 OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
1931 OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
1932 OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
1933 OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
1934 OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
1935 OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
1936 OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
1937 OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
1938 OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
1939 #if CV_SIMD128_64F
1940 OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
1941 #endif
1942 
1943 #define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
1944 template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
1945 
1946 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
1947 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
1948 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
1949 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
1950 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
1951 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
1952 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
1953 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
1954 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
1955 #if CV_SIMD128_64F
1956 OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
1957 #endif
1958 
1959 #define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
1960 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }
1961 
1962 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
1963 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
1964 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
1965 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
1966 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
1967 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
1968 OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
1969 OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
1970 OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
1971 #if CV_SIMD128_64F
1972 OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
1973 #endif
1974 
1975 #if CV_SIMD128_64F
1976 inline v_int32x4 v_round(const v_float32x4& a)
1977 {
1978  float32x4_t a_ = a.val;
1979  int32x4_t result;
1980 #if defined _MSC_VER
1981  result = vcvtnq_s32_f32(a_);
1982 #else
1983  __asm__ ("fcvtns %0.4s, %1.4s"
1984  : "=w"(result)
1985  : "w"(a_)
1986  : /* No clobbers */);
1987 #endif
1988  return v_int32x4(result);
1989 }
1990 #else
1991 inline v_int32x4 v_round(const v_float32x4& a)
1992 {
1993  // See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
1994  float32x4_t delta = vdupq_n_f32(12582912.0f);
1995  return v_int32x4(vcvtq_s32_f32(vsubq_f32(vaddq_f32(a.val, delta), delta)));
1996 }
1997 #endif
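// The non-AArch64 v_round adds and subtracts 12582912.0f (1.5 * 2^23): after
// the addition the value sits where the float spacing is exactly 1.0, so the
// FPU's round-to-nearest-even performs the rounding and the subtraction
// recovers the rounded value, matching cvRound for inputs of moderate
// magnitude (see the linked pull request). The AArch64 path emits fcvtns
// directly, via inline asm on toolchains where vcvtnq_s32_f32 may be missing.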
1998 inline v_int32x4 v_floor(const v_float32x4& a)
1999 {
2000  int32x4_t a1 = vcvtq_s32_f32(a.val);
2001  uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
2002  return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
2003 }
2004 
2005 inline v_int32x4 v_ceil(const v_float32x4& a)
2006 {
2007  int32x4_t a1 = vcvtq_s32_f32(a.val);
2008  uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
2009  return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
2010 }
2011 
2012 inline v_int32x4 v_trunc(const v_float32x4& a)
2013 { return v_int32x4(vcvtq_s32_f32(a.val)); }
2014 
2015 #if CV_SIMD128_64F
2016 inline v_int32x4 v_round(const v_float64x2& a)
2017 {
2018  static const int32x2_t zero = vdup_n_s32(0);
2019  return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero));
2020 }
2021 
2022 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2023 {
2024  return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val))));
2025 }
2026 
2027 inline v_int32x4 v_floor(const v_float64x2& a)
2028 {
2029  static const int32x2_t zero = vdup_n_s32(0);
2030  int64x2_t a1 = vcvtq_s64_f64(a.val);
2031  uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
2032  a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
2033  return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
2034 }
2035 
2036 inline v_int32x4 v_ceil(const v_float64x2& a)
2037 {
2038  static const int32x2_t zero = vdup_n_s32(0);
2039  int64x2_t a1 = vcvtq_s64_f64(a.val);
2040  uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
2041  a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
2042  return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
2043 }
2044 
2045 inline v_int32x4 v_trunc(const v_float64x2& a)
2046 {
2047  static const int32x2_t zero = vdup_n_s32(0);
2048  return v_int32x4(vcombine_s32(vmovn_s64(vcvtq_s64_f64(a.val)), zero)); // convert with truncation toward zero
2049 }
2050 #endif
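// The double-precision round/floor/ceil/trunc convert each 64-bit lane and
// narrow with vmovn_s64; the single-argument forms leave the upper two int32
// lanes of the result set to zero.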
2051 
2052 #if CV_NEON_AARCH64
2053 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
2054 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2055  const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2056  v_##_Tpvec& b0, v_##_Tpvec& b1, \
2057  v_##_Tpvec& b2, v_##_Tpvec& b3) \
2058 { \
2059  /* -- Pass 1: 64b transpose */ \
2060  _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
2061  vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
2062  vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
2063  _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
2064  vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
2065  vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
2066  _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
2067  vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
2068  vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
2069  _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
2070  vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
2071  vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
2072  /* -- Pass 2: 32b transpose */ \
2073  b0.val = vtrn1q_##suffix##32(t0, t1); \
2074  b1.val = vtrn2q_##suffix##32(t0, t1); \
2075  b2.val = vtrn1q_##suffix##32(t2, t3); \
2076  b3.val = vtrn2q_##suffix##32(t2, t3); \
2077 }
2078 
2079 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
2080 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
2081 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
2082 #else // #if CV_NEON_AARCH64
2083 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
2084 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2085  const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2086  v_##_Tpvec& b0, v_##_Tpvec& b1, \
2087  v_##_Tpvec& b2, v_##_Tpvec& b3) \
2088 { \
2089  /* m00 m01 m02 m03 */ \
2090  /* m10 m11 m12 m13 */ \
2091  /* m20 m21 m22 m23 */ \
2092  /* m30 m31 m32 m33 */ \
2093  _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
2094  _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
2095  /* m00 m10 m02 m12 */ \
2096  /* m01 m11 m03 m13 */ \
2097  /* m20 m30 m22 m32 */ \
2098  /* m21 m31 m23 m33 */ \
2099  b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
2100  b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
2101  b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
2102  b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
2103 }
2104 
2105 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
2106 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
2107 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
2108 #endif // #if CV_NEON_AARCH64
2109 
2110 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
2111 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
2112 { \
2113  _Tpvec##x2_t v = vld2q_##suffix(ptr); \
2114  a.val = v.val[0]; \
2115  b.val = v.val[1]; \
2116 } \
2117 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
2118 { \
2119  _Tpvec##x3_t v = vld3q_##suffix(ptr); \
2120  a.val = v.val[0]; \
2121  b.val = v.val[1]; \
2122  c.val = v.val[2]; \
2123 } \
2124 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
2125  v_##_Tpvec& c, v_##_Tpvec& d) \
2126 { \
2127  _Tpvec##x4_t v = vld4q_##suffix(ptr); \
2128  a.val = v.val[0]; \
2129  b.val = v.val[1]; \
2130  c.val = v.val[2]; \
2131  d.val = v.val[3]; \
2132 } \
2133 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2134  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2135 { \
2136  _Tpvec##x2_t v; \
2137  v.val[0] = a.val; \
2138  v.val[1] = b.val; \
2139  vst2q_##suffix(ptr, v); \
2140 } \
2141 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2142  const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2143 { \
2144  _Tpvec##x3_t v; \
2145  v.val[0] = a.val; \
2146  v.val[1] = b.val; \
2147  v.val[2] = c.val; \
2148  vst3q_##suffix(ptr, v); \
2149 } \
2150 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2151  const v_##_Tpvec& c, const v_##_Tpvec& d, \
2152  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2153 { \
2154  _Tpvec##x4_t v; \
2155  v.val[0] = a.val; \
2156  v.val[1] = b.val; \
2157  v.val[2] = c.val; \
2158  v.val[3] = d.val; \
2159  vst4q_##suffix(ptr, v); \
2160 }
2161 
2162 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
2163 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
2164 { \
2165  tp##x1_t a0 = vld1_##suffix(ptr); \
2166  tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2167  tp##x1_t a1 = vld1_##suffix(ptr + 2); \
2168  tp##x1_t b1 = vld1_##suffix(ptr + 3); \
2169  a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2170  b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2171 } \
2172  \
2173 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
2174  v_##tp##x2& b, v_##tp##x2& c ) \
2175 { \
2176  tp##x1_t a0 = vld1_##suffix(ptr); \
2177  tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2178  tp##x1_t c0 = vld1_##suffix(ptr + 2); \
2179  tp##x1_t a1 = vld1_##suffix(ptr + 3); \
2180  tp##x1_t b1 = vld1_##suffix(ptr + 4); \
2181  tp##x1_t c1 = vld1_##suffix(ptr + 5); \
2182  a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2183  b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2184  c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
2185 } \
2186  \
2187 inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
2188  v_##tp##x2& c, v_##tp##x2& d ) \
2189 { \
2190  tp##x1_t a0 = vld1_##suffix(ptr); \
2191  tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2192  tp##x1_t c0 = vld1_##suffix(ptr + 2); \
2193  tp##x1_t d0 = vld1_##suffix(ptr + 3); \
2194  tp##x1_t a1 = vld1_##suffix(ptr + 4); \
2195  tp##x1_t b1 = vld1_##suffix(ptr + 5); \
2196  tp##x1_t c1 = vld1_##suffix(ptr + 6); \
2197  tp##x1_t d1 = vld1_##suffix(ptr + 7); \
2198  a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2199  b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2200  c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
2201  d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
2202 } \
2203  \
2204 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
2205  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2206 { \
2207  vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2208  vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2209  vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
2210  vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
2211 } \
2212  \
2213 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
2214  const v_##tp##x2& b, const v_##tp##x2& c, \
2215  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2216 { \
2217  vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2218  vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2219  vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
2220  vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
2221  vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
2222  vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
2223 } \
2224  \
2225 inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
2226  const v_##tp##x2& c, const v_##tp##x2& d, \
2227  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2228 { \
2229  vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2230  vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2231  vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
2232  vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
2233  vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
2234  vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
2235  vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
2236  vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
2237 }
2238 
2239 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
2240 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
2241 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
2242 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
2243 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
2244 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
2245 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
2246 #if CV_SIMD128_64F
2247 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
2248 #endif
2249 
2250 OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
2251 OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
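// Channel (de)interleaving for 2/3/4 channels maps directly onto the
// structured vld2q/vld3q/vld4q and vst2q/vst3q/vst4q operations; the 64-bit
// element variants are assembled from plain 64-bit half-vector loads and
// stores instead.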
2252 
2253 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2254 {
2255  return v_float32x4(vcvtq_f32_s32(a.val));
2256 }
2257 
2258 #if CV_SIMD128_64F
2259 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2260 {
2261  float32x2_t zero = vdup_n_f32(0.0f);
2262  return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
2263 }
2264 
2265 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2266 {
2267  return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
2268 }
2269 
2270 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2271 {
2272  return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
2273 }
2274 
2275 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2276 {
2277  return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
2278 }
2279 
2280 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2281 {
2282  return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
2283 }
2284 
2285 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2286 {
2287  return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
2288 }
2289 
2290 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2291 { return v_float64x2(vcvtq_f64_s64(a.val)); }
2292 
2293 #endif
2294 
2296 
2297 inline v_int8x16 v_lut(const schar* tab, const int* idx)
2298 {
2299  schar CV_DECL_ALIGNED(32) elems[16] =
2300  {
2301  tab[idx[ 0]],
2302  tab[idx[ 1]],
2303  tab[idx[ 2]],
2304  tab[idx[ 3]],
2305  tab[idx[ 4]],
2306  tab[idx[ 5]],
2307  tab[idx[ 6]],
2308  tab[idx[ 7]],
2309  tab[idx[ 8]],
2310  tab[idx[ 9]],
2311  tab[idx[10]],
2312  tab[idx[11]],
2313  tab[idx[12]],
2314  tab[idx[13]],
2315  tab[idx[14]],
2316  tab[idx[15]]
2317  };
2318  return v_int8x16(vld1q_s8(elems));
2319 }
2320 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
2321 {
2322  schar CV_DECL_ALIGNED(32) elems[16] =
2323  {
2324  tab[idx[0]],
2325  tab[idx[0] + 1],
2326  tab[idx[1]],
2327  tab[idx[1] + 1],
2328  tab[idx[2]],
2329  tab[idx[2] + 1],
2330  tab[idx[3]],
2331  tab[idx[3] + 1],
2332  tab[idx[4]],
2333  tab[idx[4] + 1],
2334  tab[idx[5]],
2335  tab[idx[5] + 1],
2336  tab[idx[6]],
2337  tab[idx[6] + 1],
2338  tab[idx[7]],
2339  tab[idx[7] + 1]
2340  };
2341  return v_int8x16(vld1q_s8(elems));
2342 }
2343 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2344 {
2345  schar CV_DECL_ALIGNED(32) elems[16] =
2346  {
2347  tab[idx[0]],
2348  tab[idx[0] + 1],
2349  tab[idx[0] + 2],
2350  tab[idx[0] + 3],
2351  tab[idx[1]],
2352  tab[idx[1] + 1],
2353  tab[idx[1] + 2],
2354  tab[idx[1] + 3],
2355  tab[idx[2]],
2356  tab[idx[2] + 1],
2357  tab[idx[2] + 2],
2358  tab[idx[2] + 3],
2359  tab[idx[3]],
2360  tab[idx[3] + 1],
2361  tab[idx[3] + 2],
2362  tab[idx[3] + 3]
2363  };
2364  return v_int8x16(vld1q_s8(elems));
2365 }
2366 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
2367 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
2368 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
2369 
2370 inline v_int16x8 v_lut(const short* tab, const int* idx)
2371 {
2372  short CV_DECL_ALIGNED(32) elems[8] =
2373  {
2374  tab[idx[0]],
2375  tab[idx[1]],
2376  tab[idx[2]],
2377  tab[idx[3]],
2378  tab[idx[4]],
2379  tab[idx[5]],
2380  tab[idx[6]],
2381  tab[idx[7]]
2382  };
2383  return v_int16x8(vld1q_s16(elems));
2384 }
2385 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2386 {
2387  short CV_DECL_ALIGNED(32) elems[8] =
2388  {
2389  tab[idx[0]],
2390  tab[idx[0] + 1],
2391  tab[idx[1]],
2392  tab[idx[1] + 1],
2393  tab[idx[2]],
2394  tab[idx[2] + 1],
2395  tab[idx[3]],
2396  tab[idx[3] + 1]
2397  };
2398  return v_int16x8(vld1q_s16(elems));
2399 }
2400 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
2401 {
2402  return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
2403 }
2404 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
2405 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
2406 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
2407 
2408 inline v_int32x4 v_lut(const int* tab, const int* idx)
2409 {
2410  int CV_DECL_ALIGNED(32) elems[4] =
2411  {
2412  tab[idx[0]],
2413  tab[idx[1]],
2414  tab[idx[2]],
2415  tab[idx[3]]
2416  };
2417  return v_int32x4(vld1q_s32(elems));
2418 }
2419 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
2420 {
2421  return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
2422 }
2423 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
2424 {
2425  return v_int32x4(vld1q_s32(tab + idx[0]));
2426 }
2427 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
2428 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
2429 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
2430 
2431 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
2432 {
2433  return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
2434 }
2435 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
2436 {
2437  return v_int64x2(vld1q_s64(tab + idx[0]));
2438 }
2439 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
2440 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2441 
2442 inline v_float32x4 v_lut(const float* tab, const int* idx)
2443 {
2444  float CV_DECL_ALIGNED(32) elems[4] =
2445  {
2446  tab[idx[0]],
2447  tab[idx[1]],
2448  tab[idx[2]],
2449  tab[idx[3]]
2450  };
2451  return v_float32x4(vld1q_f32(elems));
2452 }
2453 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
2454 {
2455  typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64;
2456 
2457  uint64 CV_DECL_ALIGNED(32) elems[2] =
2458  {
2459  *(unaligned_uint64*)(tab + idx[0]),
2460  *(unaligned_uint64*)(tab + idx[1])
2461  };
2462  return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
2463 }
2464 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
2465 {
2466  return v_float32x4(vld1q_f32(tab + idx[0]));
2467 }
2468 
2469 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2470 {
2471  int CV_DECL_ALIGNED(32) elems[4] =
2472  {
2473  tab[vgetq_lane_s32(idxvec.val, 0)],
2474  tab[vgetq_lane_s32(idxvec.val, 1)],
2475  tab[vgetq_lane_s32(idxvec.val, 2)],
2476  tab[vgetq_lane_s32(idxvec.val, 3)]
2477  };
2478  return v_int32x4(vld1q_s32(elems));
2479 }
2480 
2481 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2482 {
2483  unsigned CV_DECL_ALIGNED(32) elems[4] =
2484  {
2485  tab[vgetq_lane_s32(idxvec.val, 0)],
2486  tab[vgetq_lane_s32(idxvec.val, 1)],
2487  tab[vgetq_lane_s32(idxvec.val, 2)],
2488  tab[vgetq_lane_s32(idxvec.val, 3)]
2489  };
2490  return v_uint32x4(vld1q_u32(elems));
2491 }
2492 
2493 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2494 {
2495  float CV_DECL_ALIGNED(32) elems[4] =
2496  {
2497  tab[vgetq_lane_s32(idxvec.val, 0)],
2498  tab[vgetq_lane_s32(idxvec.val, 1)],
2499  tab[vgetq_lane_s32(idxvec.val, 2)],
2500  tab[vgetq_lane_s32(idxvec.val, 3)]
2501  };
2502  return v_float32x4(vld1q_f32(elems));
2503 }
2504 
2505 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2506 {
2507  /*int CV_DECL_ALIGNED(32) idx[4];
2508  v_store(idx, idxvec);
2509 
2510  float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
2511  float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
2512 
2513  float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
2514  x = v_float32x4(xxyy.val[0]);
2515  y = v_float32x4(xxyy.val[1]);*/
2516  int CV_DECL_ALIGNED(32) idx[4];
2517  v_store_aligned(idx, idxvec);
2518 
2519  x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
2520  y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
2521 }
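// v_lut_deinterleave gathers interleaved (x, y) pairs from the table:
// x[i] = tab[idx[i]] and y[i] = tab[idx[i] + 1], using an aligned scratch
// array for the extracted indices.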
2522 
2523 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2524 {
2525  return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
2526 }
2527 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
2528 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2529 {
2530  return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
2531 }
2532 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
2533 
2534 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2535 {
2536  return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
2537 }
2538 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2539 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2540 {
2541  int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
2542  return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
2543 }
2544 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2545 
2546 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2547 {
2548  int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
2549  return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
2550 }
2551 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2552 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2553 
2554 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2555 {
2556  return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
2557 }
2558 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2559 
2560 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2561 {
2562  return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
2563 }
2564 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2565 
2566 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2567 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2568 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
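// v_pack_triplets drops the fourth lane of every group of four lanes (useful
// for packed 3-channel data); with 32-bit lanes the first three elements are
// already in place, so the vector is returned unchanged.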
2569 
2570 #if CV_SIMD128_64F
2571 inline v_float64x2 v_lut(const double* tab, const int* idx)
2572 {
2573  double CV_DECL_ALIGNED(32) elems[2] =
2574  {
2575  tab[idx[0]],
2576  tab[idx[1]]
2577  };
2578  return v_float64x2(vld1q_f64(elems));
2579 }
2580 
2581 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
2582 {
2583  return v_float64x2(vld1q_f64(tab + idx[0]));
2584 }
2585 
2586 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
2587 {
2588  double CV_DECL_ALIGNED(32) elems[2] =
2589  {
2590  tab[vgetq_lane_s32(idxvec.val, 0)],
2591  tab[vgetq_lane_s32(idxvec.val, 1)],
2592  };
2593  return v_float64x2(vld1q_f64(elems));
2594 }
2595 
2596 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
2597 {
2598  int CV_DECL_ALIGNED(32) idx[4];
2599  v_store_aligned(idx, idxvec);
2600 
2601  x = v_float64x2(tab[idx[0]], tab[idx[1]]);
2602  y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
2603 }
2604 #endif
2605 
2607 #if CV_FP16
2608 inline v_float32x4 v_load_expand(const hfloat* ptr)
2609 {
2610  float16x4_t v =
2611  #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
2612  (float16x4_t)vld1_s16((const short*)ptr);
2613  #else
2614  vld1_f16((const __fp16*)ptr);
2615  #endif
2616  return v_float32x4(vcvt_f32_f16(v));
2617 }
2618 
2619 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2620 {
2621  float16x4_t hv = vcvt_f16_f32(v.val);
2622 
2623  #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
2624  vst1_s16((short*)ptr, (int16x4_t)hv);
2625  #else
2626  vst1_f16((__fp16*)ptr, hv);
2627  #endif
2628 }
2629 #else
2630 inline v_float32x4 v_load_expand(const hfloat* ptr)
2631 {
2632  const int N = 4;
2633  float buf[N];
2634  for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2635  return v_load(buf);
2636 }
2637 
2638 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2639 {
2640  const int N = 4;
2641  float buf[N];
2642  v_store(buf, v);
2643  for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
2644 }
2645 #endif
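// Half-precision I/O: when CV_FP16 is available the conversions use the
// hardware vcvt_f32_f16 / vcvt_f16_f32 instructions, with a reinterpreting
// load/store where vld1_f16 / vst1_f16 are not provided as macros; otherwise
// the elements are converted one by one through a small scalar buffer.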
2646 
2647 inline void v_cleanup() {}
2648 
2649 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2650 
2652 
2653 }
2654 
2655 #endif