EstervQrCode 2.0.0
Library for QR code manipulation
intrin_neon.hpp
1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5// By downloading, copying, installing or using the software you agree to this license.
6// If you do not agree to this license, do not download, install,
7// copy or use the software.
8//
9//
10// License Agreement
11// For Open Source Computer Vision Library
12//
13// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16// Copyright (C) 2015, Itseez Inc., all rights reserved.
17// Third party copyrights are property of their respective owners.
18//
19// Redistribution and use in source and binary forms, with or without modification,
20// are permitted provided that the following conditions are met:
21//
22// * Redistribution's of source code must retain the above copyright notice,
23// this list of conditions and the following disclaimer.
24//
25// * Redistribution's in binary form must reproduce the above copyright notice,
26// this list of conditions and the following disclaimer in the documentation
27// and/or other materials provided with the distribution.
28//
29// * The name of the copyright holders may not be used to endorse or promote products
30// derived from this software without specific prior written permission.
31//
32// This software is provided by the copyright holders and contributors "as is" and
33// any express or implied warranties, including, but not limited to, the implied
34// warranties of merchantability and fitness for a particular purpose are disclaimed.
35// In no event shall the Intel Corporation or contributors be liable for any direct,
36// indirect, incidental, special, exemplary, or consequential damages
37// (including, but not limited to, procurement of substitute goods or services;
38// loss of use, data, or profits; or business interruption) however caused
39// and on any theory of liability, whether in contract, strict liability,
40// or tort (including negligence or otherwise) arising in any way out of
41// the use of this software, even if advised of the possibility of such damage.
42//
43//M*/
44
45#ifndef OPENCV_HAL_INTRIN_NEON_HPP
46#define OPENCV_HAL_INTRIN_NEON_HPP
47
48#include <algorithm>
49#include "opencv2/core/utility.hpp"
50
51namespace cv
52{
53
55
56CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
57
58#define CV_SIMD128 1
59#if defined(__aarch64__) || defined(_M_ARM64)
60#define CV_SIMD128_64F 1
61#else
62#define CV_SIMD128_64F 0
63#endif
64
65// The following macro checks if the code is being compiled for the
66// AArch64 execution state of Armv8, to enable the 128-bit
67// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
68// the Arm C Language Extension (ACLE) specifications [1] to check the
69// availability of 128-bit intrinsics, and it is supported by clang
70// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
71// Visual Studio [2].
72//
73// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
74// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
75#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
76#define CV_NEON_AARCH64 1
77#else
78#define CV_NEON_AARCH64 0
79#endif
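// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): CV_NEON_AARCH64 is intended to guard intrinsics that
// exist only in the AArch64 execution state, e.g. single-instruction reductions.
inline unsigned example_horizontal_sum_u32(uint32x4_t v)
{
#if CV_NEON_AARCH64
    return vaddvq_u32(v);                            // AArch64-only horizontal add
#else
    uint32x2_t t = vadd_u32(vget_low_u32(v), vget_high_u32(v));
    return vget_lane_u32(vpadd_u32(t, t), 0);        // pairwise-add fallback
#endif
}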
80
81
83
84#if CV_SIMD128_64F
85#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
86 inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
87 { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
88#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
89 inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \
90 { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
91#else
92#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
93 inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
94 { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
95#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
96 inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
97 { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
98#endif
99
100#if CV_SIMD128_64F
101#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
102 template <typename T> static inline \
103 _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
104 template <typename T> static inline \
105 float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
106#else
107#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
108#endif
109
110#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
111 OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
112 OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
113 OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
114
115#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
116 OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
117
118#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
119 OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)
120
121OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
122OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
123OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
124OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
125OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
126OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
127OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
128OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
129OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
130#if CV_SIMD128_64F
131OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
132#endif
133
135template<typename T> struct VTraits {
136 static inline int vlanes() { return T::nlanes; }
137 enum { max_nlanes = T::nlanes, nlanes = T::nlanes };
138 using lane_type = typename T::lane_type;
139};
140
141template<typename T>
142inline typename VTraits<T>::lane_type v_get0(const T& v)
143{
144 return v.get0();
145}
147
148struct v_uint8x16
149{
150 v_uint8x16() {}
151 explicit v_uint8x16(uint8x16_t v) : val(v) {}
152 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
153 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
154 {
155 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
156 val = vld1q_u8(v);
157 }
158 uint8x16_t val;
159
160private:
161 friend struct VTraits<v_uint8x16>;
162 enum { nlanes = 16 };
163 typedef uchar lane_type;
164
165 friend typename VTraits<v_uint8x16>::lane_type v_get0<v_uint8x16>(const v_uint8x16& v);
166 uchar get0() const
167 {
168 return vgetq_lane_u8(val, 0);
169 }
170};
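// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): building a register from scalars and reading lane 0
// through the public v_get0() accessor declared above.
inline uchar example_first_lane_u8()
{
    v_uint8x16 v(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    return v_get0(v); // 0
}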
171
172struct v_int8x16
173{
174 v_int8x16() {}
175 explicit v_int8x16(int8x16_t v) : val(v) {}
176 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
177 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
178 {
179 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
180 val = vld1q_s8(v);
181 }
182 int8x16_t val;
183
184private:
185 friend struct VTraits<v_int8x16>;
186 enum { nlanes = 16 };
187 typedef schar lane_type;
188
189 friend typename VTraits<v_int8x16>::lane_type v_get0<v_int8x16>(const v_int8x16& v);
190 schar get0() const
191 {
192 return vgetq_lane_s8(val, 0);
193 }
194};
195
196struct v_uint16x8
197{
198 v_uint16x8() {}
199 explicit v_uint16x8(uint16x8_t v) : val(v) {}
200 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
201 {
202 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
203 val = vld1q_u16(v);
204 }
205 uint16x8_t val;
206
207private:
208 friend struct VTraits<v_uint16x8>;
209 enum { nlanes = 8 };
210 typedef ushort lane_type;
211
212 friend typename VTraits<v_uint16x8>::lane_type v_get0<v_uint16x8>(const v_uint16x8& v);
213 ushort get0() const
214 {
215 return vgetq_lane_u16(val, 0);
216 }
217};
218
219struct v_int16x8
220{
221 v_int16x8() {}
222 explicit v_int16x8(int16x8_t v) : val(v) {}
223 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
224 {
225 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
226 val = vld1q_s16(v);
227 }
228 int16x8_t val;
229
230private:
231 friend struct VTraits<v_int16x8>;
232 enum { nlanes = 8 };
233 typedef short lane_type;
234
235 friend typename VTraits<v_int16x8>::lane_type v_get0<v_int16x8>(const v_int16x8& v);
236 short get0() const
237 {
238 return vgetq_lane_s16(val, 0);
239 }
240};
241
242struct v_uint32x4
243{
244 v_uint32x4() {}
245 explicit v_uint32x4(uint32x4_t v) : val(v) {}
246 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
247 {
248 unsigned v[] = {v0, v1, v2, v3};
249 val = vld1q_u32(v);
250 }
251 uint32x4_t val;
252
253private:
254 friend struct VTraits<v_uint32x4>;
255 enum { nlanes = 4 };
256 typedef unsigned lane_type;
257
258 friend typename VTraits<v_uint32x4>::lane_type v_get0<v_uint32x4>(const v_uint32x4& v);
259 unsigned get0() const
260 {
261 return vgetq_lane_u32(val, 0);
262 }
263};
264
265struct v_int32x4
266{
267 v_int32x4() {}
268 explicit v_int32x4(int32x4_t v) : val(v) {}
269 v_int32x4(int v0, int v1, int v2, int v3)
270 {
271 int v[] = {v0, v1, v2, v3};
272 val = vld1q_s32(v);
273 }
274 int32x4_t val;
275
276private:
277 friend struct VTraits<v_int32x4>;
278 enum { nlanes = 4 };
279 typedef int lane_type;
280
281 friend typename VTraits<v_int32x4>::lane_type v_get0<v_int32x4>(const v_int32x4& v);
282 int get0() const
283 {
284 return vgetq_lane_s32(val, 0);
285 }
286};
287
288struct v_float32x4
289{
290 v_float32x4() {}
291 explicit v_float32x4(float32x4_t v) : val(v) {}
292 v_float32x4(float v0, float v1, float v2, float v3)
293 {
294 float v[] = {v0, v1, v2, v3};
295 val = vld1q_f32(v);
296 }
297 float32x4_t val;
298
299private:
300 friend struct VTraits<v_float32x4>;
301 enum { nlanes = 4 };
302 typedef float lane_type;
303
304 friend typename VTraits<v_float32x4>::lane_type v_get0<v_float32x4>(const v_float32x4& v);
305 float get0() const
306 {
307 return vgetq_lane_f32(val, 0);
308 }
309};
310
311struct v_uint64x2
312{
313 v_uint64x2() {}
314 explicit v_uint64x2(uint64x2_t v) : val(v) {}
315 v_uint64x2(uint64 v0, uint64 v1)
316 {
317 uint64 v[] = {v0, v1};
318 val = vld1q_u64(v);
319 }
320 uint64x2_t val;
321private:
322 friend struct VTraits<v_uint64x2>;
323 enum { nlanes = 2 };
324 typedef uint64 lane_type;
325
326 friend typename VTraits<v_uint64x2>::lane_type v_get0<v_uint64x2>(const v_uint64x2& v);
327 uint64 get0() const
328 {
329 return vgetq_lane_u64(val, 0);
330 }
331};
332
333struct v_int64x2
334{
335 v_int64x2() {}
336 explicit v_int64x2(int64x2_t v) : val(v) {}
337 v_int64x2(int64 v0, int64 v1)
338 {
339 int64 v[] = {v0, v1};
340 val = vld1q_s64(v);
341 }
342 int64x2_t val;
343
344private:
345 friend struct VTraits<v_int64x2>;
346 enum { nlanes = 2 };
347 typedef int64 lane_type;
348
349 friend typename VTraits<v_int64x2>::lane_type v_get0<v_int64x2>(const v_int64x2& v);
350 int64 get0() const
351 {
352 return vgetq_lane_s64(val, 0);
353 }
354};
355
356#if CV_SIMD128_64F
357struct v_float64x2
358{
359 v_float64x2() {}
360 explicit v_float64x2(float64x2_t v) : val(v) {}
361 v_float64x2(double v0, double v1)
362 {
363 double v[] = {v0, v1};
364 val = vld1q_f64(v);
365 }
366
367 float64x2_t val;
368private:
369 friend struct VTraits<v_float64x2>;
370 enum { nlanes = 2 };
371 typedef double lane_type;
372
373 friend typename VTraits<v_float64x2>::lane_type v_get0<v_float64x2>(const v_float64x2& v);
374 double get0() const
375 {
376 return vgetq_lane_f64(val, 0);
377 }
378};
379#endif
380
381#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
382inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
383inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
384inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
385inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
386inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
387inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
388inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
389inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
390inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
391inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
392inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
393inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
394
395OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
396OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
397OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
398OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
399OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
400OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
401OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
402OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
403OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
404#if CV_SIMD128_64F
405#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
406inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
407OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
408OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
409OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
410OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
411OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
412OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
413OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
414OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
415OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
416OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
417OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
418#endif
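// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): broadcasting a scalar and reinterpreting the lanes as
// another type without changing the underlying bits.
inline v_uint32x4 example_float_bits()
{
    v_float32x4 ones = v_setall_f32(1.0f);
    return v_reinterpret_as_u32(ones); // every lane holds 0x3F800000
}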
419
420#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
421inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
422{ \
423 hreg a1 = mov(a.val), b1 = mov(b.val); \
424 return _Tpvec(vcombine_##suffix(a1, b1)); \
425} \
426inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
427{ \
428 hreg a1 = mov(a.val); \
429 vst1_##suffix(ptr, a1); \
430} \
431template<int n> inline \
432_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
433{ \
434 hreg a1 = rshr(a.val, n); \
435 hreg b1 = rshr(b.val, n); \
436 return _Tpvec(vcombine_##suffix(a1, b1)); \
437} \
438template<int n> inline \
439void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
440{ \
441 hreg a1 = rshr(a.val, n); \
442 vst1_##suffix(ptr, a1); \
443}
444
445OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16)
446OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
447OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
448OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
449OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
450OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)
451
452OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16)
453OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
454
455// pack boolean
456inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
457{
458 uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));
459 return v_uint8x16(ab);
460}
461
462inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
463 const v_uint32x4& c, const v_uint32x4& d)
464{
465 uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
466 uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
467 return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));
468}
469
470inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
471 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
472 const v_uint64x2& g, const v_uint64x2& h)
473{
474 uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
475 uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
476 uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
477 uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));
478
479 uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
480 uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
481 return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
482}
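// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): v_pack_b narrows wider "all ones / all zeros" masks
// into a single byte-sized mask register, here from two 16-bit masks.
inline v_uint8x16 example_pack_mask(const v_uint16x8& lo_mask, const v_uint16x8& hi_mask)
{
    return v_pack_b(lo_mask, hi_mask); // 16 byte-sized mask lanes
}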
483
484inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
485 const v_float32x4& m1, const v_float32x4& m2,
486 const v_float32x4& m3)
487{
488 float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
489 float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
490 res = vmlaq_lane_f32(res, m1.val, vl, 1);
491 res = vmlaq_lane_f32(res, m2.val, vh, 0);
492 res = vmlaq_lane_f32(res, m3.val, vh, 1);
493 return v_float32x4(res);
494}
495
496inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
497 const v_float32x4& m1, const v_float32x4& m2,
498 const v_float32x4& a)
499{
500 float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
501 float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
502 res = vmlaq_lane_f32(res, m1.val, vl, 1);
503 res = vmlaq_lane_f32(res, m2.val, vh, 0);
504 res = vaddq_f32(res, a.val);
505 return v_float32x4(res);
506}
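// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): treating c0..c3 as the columns of a 4x4 matrix, v_matmul
// computes c0*p0 + c1*p1 + c2*p2 + c3*p3; with identity columns the input returns unchanged.
inline v_float32x4 example_identity_transform(const v_float32x4& p)
{
    v_float32x4 c0(1.f, 0.f, 0.f, 0.f), c1(0.f, 1.f, 0.f, 0.f),
                c2(0.f, 0.f, 1.f, 0.f), c3(0.f, 0.f, 0.f, 1.f);
    return v_matmul(p, c0, c1, c2, c3); // == p
}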
507
508#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
509inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \
510{ \
511 return _Tpvec(intrin(a.val, b.val)); \
512}
513
514OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8)
515OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8)
516OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8)
517OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8)
518OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16)
519OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16)
520OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16)
521OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16)
522OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32)
523OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32)
524OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32)
525OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32)
526OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32)
527OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32)
528OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32)
529OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32)
530OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32)
531OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64)
532OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64)
533OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64)
534OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64)
535#if CV_SIMD128_64F
536OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32)
537OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64)
538OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64)
539OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, vmulq_f64)
540OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64)
541#else
542inline v_float32x4 v_div (const v_float32x4& a, const v_float32x4& b)
543{
544 float32x4_t reciprocal = vrecpeq_f32(b.val);
545 reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
546 reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
547 return v_float32x4(vmulq_f32(a.val, reciprocal));
548}
549#endif
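// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): lane-wise (2*x + 1) / c; v_div maps to vdivq_f32 on
// AArch64 and to the reciprocal-refinement fallback above on 32-bit ARM.
inline v_float32x4 example_axpb_over_c(const v_float32x4& x, const v_float32x4& c)
{
    v_float32x4 a = v_setall_f32(2.0f), b = v_setall_f32(1.0f);
    return v_div(v_add(v_mul(a, x), b), c);
}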
550
551// saturating multiply 8-bit, 16-bit
552#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
553 inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b) \
554 { \
555 _Tpwvec c, d; \
556 v_mul_expand(a, b, c, d); \
557 return v_pack(c, d); \
558 }
559
560OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8)
561OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
562OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8, v_int32x4)
563OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)
564
565// Multiply and expand
566inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
567 v_int16x8& c, v_int16x8& d)
568{
569 c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
570#if CV_NEON_AARCH64
571 d.val = vmull_high_s8(a.val, b.val);
572#else // #if CV_NEON_AARCH64
573 d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
574#endif // #if CV_NEON_AARCH64
575}
576
577inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
578 v_uint16x8& c, v_uint16x8& d)
579{
580 c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
581#if CV_NEON_AARCH64
582 d.val = vmull_high_u8(a.val, b.val);
583#else // #if CV_NEON_AARCH64
584 d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
585#endif // #if CV_NEON_AARCH64
586}
587
588inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
589 v_int32x4& c, v_int32x4& d)
590{
591 c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
592#if CV_NEON_AARCH64
593 d.val = vmull_high_s16(a.val, b.val);
594#else // #if CV_NEON_AARCH64
595 d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
596#endif // #if CV_NEON_AARCH64
597}
598
599inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
600 v_uint32x4& c, v_uint32x4& d)
601{
602 c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
603#if CV_NEON_AARCH64
604 d.val = vmull_high_u16(a.val, b.val);
605#else // #if CV_NEON_AARCH64
606 d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
607#endif // #if CV_NEON_AARCH64
608}
609
610inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
611 v_uint64x2& c, v_uint64x2& d)
612{
613 c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
614#if CV_NEON_AARCH64
615 d.val = vmull_high_u32(a.val, b.val);
616#else // #if CV_NEON_AARCH64
617 d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
618#endif // #if CV_NEON_AARCH64
619}
620
621inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
622{
623#if CV_NEON_AARCH64
624 int32x4_t c = vmull_high_s16(a.val, b.val);
625#else // #if CV_NEON_AARCH64
626 int32x4_t c = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
627#endif // #if CV_NEON_AARCH64
628 return v_int16x8(vcombine_s16(
629 vshrn_n_s32(vmull_s16( vget_low_s16(a.val), vget_low_s16(b.val)), 16),
630 vshrn_n_s32(c, 16)
631 ));
632}
633inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
634{
635#if CV_NEON_AARCH64
636 uint32x4_t c = vmull_high_u16(a.val, b.val);
637#else // #if CV_NEON_AARCH64
638 uint32x4_t c = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
639#endif // #if CV_NEON_AARCH64
640 return v_uint16x8(vcombine_u16(
641 vshrn_n_u32(vmull_u16( vget_low_u16(a.val), vget_low_u16(b.val)), 16),
642 vshrn_n_u32(c, 16)
643 ));
644}
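// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): v_mul_expand keeps the full 32-bit products of 16-bit
// lanes in two output registers, combined here only for illustration.
inline v_uint32x4 example_widening_mul(const v_uint16x8& a, const v_uint16x8& b)
{
    v_uint32x4 lo, hi;
    v_mul_expand(a, b, lo, hi); // lo: products of lanes 0..3, hi: lanes 4..7
    return v_add(lo, hi);
}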
645
647
648// 16 >> 32
649inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
650{
651 int16x8_t uzp1, uzp2;
652 _v128_unzip(a.val, b.val, uzp1, uzp2);
653 int16x4_t a0 = vget_low_s16(uzp1);
654 int16x4_t b0 = vget_high_s16(uzp1);
655 int16x4_t a1 = vget_low_s16(uzp2);
656 int16x4_t b1 = vget_high_s16(uzp2);
657 int32x4_t p = vmull_s16(a0, b0);
658 return v_int32x4(vmlal_s16(p, a1, b1));
659}
660inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
661{
662 int16x8_t uzp1, uzp2;
663 _v128_unzip(a.val, b.val, uzp1, uzp2);
664 int16x4_t a0 = vget_low_s16(uzp1);
665 int16x4_t b0 = vget_high_s16(uzp1);
666 int16x4_t a1 = vget_low_s16(uzp2);
667 int16x4_t b1 = vget_high_s16(uzp2);
668 int32x4_t p = vmlal_s16(c.val, a0, b0);
669 return v_int32x4(vmlal_s16(p, a1, b1));
670}
671
672// 32 >> 64
673inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
674{
675 int32x4_t uzp1, uzp2;
676 _v128_unzip(a.val, b.val, uzp1, uzp2);
677 int32x2_t a0 = vget_low_s32(uzp1);
678 int32x2_t b0 = vget_high_s32(uzp1);
679 int32x2_t a1 = vget_low_s32(uzp2);
680 int32x2_t b1 = vget_high_s32(uzp2);
681 int64x2_t p = vmull_s32(a0, b0);
682 return v_int64x2(vmlal_s32(p, a1, b1));
683}
684inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
685{
686 int32x4_t uzp1, uzp2;
687 _v128_unzip(a.val, b.val, uzp1, uzp2);
688 int32x2_t a0 = vget_low_s32(uzp1);
689 int32x2_t b0 = vget_high_s32(uzp1);
690 int32x2_t a1 = vget_low_s32(uzp2);
691 int32x2_t b1 = vget_high_s32(uzp2);
692 int64x2_t p = vmlal_s32(c.val, a0, b0);
693 return v_int64x2(vmlal_s32(p, a1, b1));
694}
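// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): v_dotprod multiplies adjacent 16-bit lane pairs and
// accumulates each pair into a 32-bit lane.
inline v_int32x4 example_pairwise_dot()
{
    v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
    v_int16x8 b = v_setall_s16(1);
    return v_dotprod(a, b); // lanes: 3, 7, 11, 15
}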
695
696// 8 >> 32
697#ifdef CV_NEON_DOT
698#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \
699inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b) \
700{ \
701 return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\
702} \
703inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
704{ \
705 return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \
706}
707
708OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_uint32x4, v_uint8x16, u32)
709OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_int32x4, v_int8x16, s32)
710#else
711inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
712{
713 const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0));
714 const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
715 const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
716 const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
717
718 uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
719 vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
720 uint16x8_t odd = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
721 vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));
722
723 uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
724 vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
725 uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
726 vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
727 return v_uint32x4(vaddq_u32(s0, s1));
728}
729inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
730 const v_uint32x4& c)
731{
732 return v_add(v_dotprod_expand(a, b), c);
733}
734
735inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
736{
737 int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
738 int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
739 int16x8_t uzp1, uzp2;
740 _v128_unzip(p0, p1, uzp1, uzp2);
741 int16x8_t sum = vaddq_s16(uzp1, uzp2);
742 int16x4_t uzpl1, uzpl2;
743 _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
744 return v_int32x4(vaddl_s16(uzpl1, uzpl2));
745}
746inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
747 const v_int32x4& c)
748{
749 return v_add(v_dotprod_expand(a, b), c);
750}
751#endif
752// 16 >> 64
753inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
754{
755 const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
756 const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
757
758 uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
759 vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
760 uint32x4_t odd = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
761 vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
762 uint32x4_t uzp1, uzp2;
763 _v128_unzip(even, odd, uzp1, uzp2);
764 uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
765 uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
766 return v_uint64x2(vaddq_u64(s0, s1));
767}
768inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
769{ return v_add(v_dotprod_expand(a, b), c); }
770
771inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
772{
773 int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
774 int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
775
776 int32x4_t uzp1, uzp2;
777 _v128_unzip(p0, p1, uzp1, uzp2);
778 int32x4_t sum = vaddq_s32(uzp1, uzp2);
779
780 int32x2_t uzpl1, uzpl2;
781 _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
782 return v_int64x2(vaddl_s32(uzpl1, uzpl2));
783}
784inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
785 const v_int64x2& c)
786{ return v_add(v_dotprod_expand(a, b), c); }
787
788// 32 >> 64f
789#if CV_SIMD128_64F
790inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
791{ return v_cvt_f64(v_dotprod(a, b)); }
792inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
793 const v_float64x2& c)
794{ return v_add(v_dotprod_expand(a, b), c); }
795#endif
796
798
799// 16 >> 32
800inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
801{
802#if CV_NEON_AARCH64
803 int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
804 return v_int32x4(vmlal_high_s16(p, a.val, b.val));
805#else
806 int16x4_t a0 = vget_low_s16(a.val);
807 int16x4_t a1 = vget_high_s16(a.val);
808 int16x4_t b0 = vget_low_s16(b.val);
809 int16x4_t b1 = vget_high_s16(b.val);
810 int32x4_t p = vmull_s16(a0, b0);
811 return v_int32x4(vmlal_s16(p, a1, b1));
812#endif
813}
814inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
815{
816#if CV_NEON_AARCH64
817 int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
818 return v_int32x4(vmlal_high_s16(p, a.val, b.val));
819#else
820 int16x4_t a0 = vget_low_s16(a.val);
821 int16x4_t a1 = vget_high_s16(a.val);
822 int16x4_t b0 = vget_low_s16(b.val);
823 int16x4_t b1 = vget_high_s16(b.val);
824 int32x4_t p = vmlal_s16(c.val, a0, b0);
825 return v_int32x4(vmlal_s16(p, a1, b1));
826#endif
827}
828
829// 32 >> 64
830inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
831{
832#if CV_NEON_AARCH64
833 int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
834 return v_int64x2(vmlal_high_s32(p, a.val, b.val));
835#else
836 int32x2_t a0 = vget_low_s32(a.val);
837 int32x2_t a1 = vget_high_s32(a.val);
838 int32x2_t b0 = vget_low_s32(b.val);
839 int32x2_t b1 = vget_high_s32(b.val);
840 int64x2_t p = vmull_s32(a0, b0);
841 return v_int64x2(vmlal_s32(p, a1, b1));
842#endif
843}
844inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
845{
846#if CV_NEON_AARCH64
847 int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
848 return v_int64x2(vmlal_high_s32(p, a.val, b.val));
849#else
850 int32x2_t a0 = vget_low_s32(a.val);
851 int32x2_t a1 = vget_high_s32(a.val);
852 int32x2_t b0 = vget_low_s32(b.val);
853 int32x2_t b1 = vget_high_s32(b.val);
854 int64x2_t p = vmlal_s32(c.val, a0, b0);
855 return v_int64x2(vmlal_s32(p, a1, b1));
856#endif
857}
858
859// 8 >> 32
860#ifdef CV_NEON_DOT
861#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \
862inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b) \
863{ \
864 return v_dotprod_expand(a, b); \
865} \
866inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
867{ \
868 return v_dotprod_expand(a, b, c); \
869}
870
871OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_uint32x4, v_uint8x16, u32)
872OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_int32x4, v_int8x16, s32)
873#else
874inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
875{
876 uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
877 uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
878 uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
879 uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
880 return v_uint32x4(vaddq_u32(s0, s1));
881}
882inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
883{
884 return v_add(v_dotprod_expand_fast(a, b), c);
885}
886
887inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
888{
889 int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
890 prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
891 return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
892}
893inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
894{
895 return v_add(v_dotprod_expand_fast(a, b), c);
896}
897#endif
898
899// 16 >> 64
900inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
901{
902 uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
903 uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
904 uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
905 uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
906 return v_uint64x2(vaddq_u64(s0, s1));
907}
908inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
909{ return v_add(v_dotprod_expand_fast(a, b), c); }
910
911inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
912{
913 int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
914 prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
915 return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
916}
917inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
918{ return v_add(v_dotprod_expand_fast(a, b), c); }
919
920// 32 >> 64f
921#if CV_SIMD128_64F
922inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
923{ return v_cvt_f64(v_dotprod_fast(a, b)); }
924inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
925{ return v_add(v_dotprod_expand_fast(a, b), c); }
926#endif
927
928
929#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
930 OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \
931 OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \
932 OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \
933 inline _Tpvec v_not (const _Tpvec& a) \
934 { \
935 return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
936 }
937
938OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
939OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
940OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
941OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
942OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
943OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
944OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
945OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
946
947#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
948inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \
949{ \
950 return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
951}
952
953OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32)
954OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32)
955OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32)
956
957inline v_float32x4 v_not (const v_float32x4& a)
958{
959 return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
960}
961
962#if CV_SIMD128_64F
963inline v_float32x4 v_sqrt(const v_float32x4& x)
964{
965 return v_float32x4(vsqrtq_f32(x.val));
966}
967
968inline v_float32x4 v_invsqrt(const v_float32x4& x)
969{
970 v_float32x4 one = v_setall_f32(1.0f);
971 return v_div(one, v_sqrt(x));
972}
973#else
974inline v_float32x4 v_sqrt(const v_float32x4& x)
975{
976 float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
977 float32x4_t e = vrsqrteq_f32(x1);
978 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
979 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
980 return v_float32x4(vmulq_f32(x.val, e));
981}
982
983inline v_float32x4 v_invsqrt(const v_float32x4& x)
984{
985 float32x4_t e = vrsqrteq_f32(x.val);
986 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
987 e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
988 return v_float32x4(e);
989}
990#endif
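// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): scaling a vector by the reciprocal square root of its
// squared norm, i.e. v / sqrt(sq_norm) per lane.
inline v_float32x4 example_rsqrt_scale(const v_float32x4& v, const v_float32x4& sq_norm)
{
    return v_mul(v, v_invsqrt(sq_norm));
}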
991
992#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
993inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
994
995OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
996OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
997OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
998
999inline v_float32x4 v_abs(v_float32x4 x)
1000{ return v_float32x4(vabsq_f32(x.val)); }
1001
1002#if CV_SIMD128_64F
1003#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
1004inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \
1005{ \
1006 return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
1007}
1008
1009OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64)
1010OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64)
1011OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64)
1012
1013inline v_float64x2 v_not (const v_float64x2& a)
1014{
1015 return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
1016}
1017
1018inline v_float64x2 v_sqrt(const v_float64x2& x)
1019{
1020 return v_float64x2(vsqrtq_f64(x.val));
1021}
1022
1023inline v_float64x2 v_invsqrt(const v_float64x2& x)
1024{
1025 v_float64x2 one = v_setall_f64(1.0f);
1026 return v_div(one, v_sqrt(x));
1027}
1028
1029inline v_float64x2 v_abs(v_float64x2 x)
1030{ return v_float64x2(vabsq_f64(x.val)); }
1031#endif
1032
1033// TODO: exp, log, sin, cos
1034
1035#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
1036inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1037{ \
1038 return _Tpvec(intrin(a.val, b.val)); \
1039}
1040
1041OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
1042OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
1043OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
1044OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
1045OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
1046OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
1047OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
1048OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
1049OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
1050OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
1051OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
1052OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
1053OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
1054OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
1055#if CV_SIMD128_64F
1056OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
1057OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
1058#endif
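// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): clamping every lane to the [0, 1] range with v_min/v_max.
inline v_float32x4 example_clamp01(const v_float32x4& x)
{
    return v_min(v_max(x, v_setall_f32(0.f)), v_setall_f32(1.f));
}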
1059
1060#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
1061inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
1062{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
1063inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
1064{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
1065inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \
1066{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
1067inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \
1068{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
1069inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \
1070{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
1071inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \
1072{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
1073
1074OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
1075OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
1076OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
1077OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
1078OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
1079OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
1080OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
1081#if defined(__aarch64__) || defined(_M_ARM64)
1082static inline uint64x2_t vmvnq_u64(uint64x2_t a)
1083{
1084 uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
1085 return veorq_u64(a, vx);
1086}
1087//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
1088//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
1089static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
1090{ return v_uint64x2(vceqq_u64(a.val, b.val)); }
1091static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
1092{ return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); }
1093static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
1094{ return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); }
1095static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
1096{ return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); }
1097#else
1098static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
1099{
1100 uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
1101 uint32x4_t swapped = vrev64q_u32(cmp);
1102 return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped)));
1103}
1104static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
1105{
1106 uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
1107 uint32x4_t swapped = vrev64q_u32(cmp);
1108 uint64x2_t v_eq = vreinterpretq_u64_u32(vandq_u32(cmp, swapped));
1109 uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
1110 return v_uint64x2(veorq_u64(v_eq, vx));
1111}
1112static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
1113{
1114 return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
1115}
1116static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
1117{
1118 return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
1119}
1120#endif
1121#if CV_SIMD128_64F
1122OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
1123#endif
1124
1125inline v_float32x4 v_not_nan(const v_float32x4& a)
1126{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
1127#if CV_SIMD128_64F
1128inline v_float64x2 v_not_nan(const v_float64x2& a)
1129{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
1130#endif
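// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): v_not_nan yields an all-ones mask for non-NaN lanes,
// so AND-ing with it forces NaN lanes to +0.0f while leaving the rest untouched.
inline v_float32x4 example_zero_out_nans(const v_float32x4& x)
{
    return v_and(x, v_not_nan(x));
}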
1131
1132OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
1133OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
1134OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
1135OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
1136OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
1137OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
1138OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
1139OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
1140OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
1141OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
1142OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
1143OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
1144
1145OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
1146OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
1147OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
1148OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
1149#if CV_SIMD128_64F
1150OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
1151#endif
1152
1154inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1155{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
1156inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1157{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }
1158
1159#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
1160inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
1161{ \
1162 return _Tpvec2(cast(intrin(a.val, b.val))); \
1163}
1164
1165OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
1166OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
1167OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
1168
1169inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
1170{
1171 v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
1172 return v_sqrt(x);
1173}
1174
1175inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
1176{
1177 return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
1178}
1179
1180inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1181{
1182#if CV_SIMD128_64F
1183 // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined),
1184 // also adds FMA support both for single- and double-precision floating-point vectors
1185 return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
1186#else
1187 return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
1188#endif
1189}
1190
1191inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1192{
1193 return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
1194}
1195
1196inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1197{
1198 return v_fma(a, b, c);
1199}
1200
1201inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1202{
1203 return v_fma(a, b, c);
1204}
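// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): linear interpolation a + t*(b - a) expressed with v_fma,
// which uses a fused multiply-add when CV_SIMD128_64F (ARMv8) is available.
inline v_float32x4 example_lerp(const v_float32x4& a, const v_float32x4& b, const v_float32x4& t)
{
    return v_fma(t, v_sub(b, a), a);
}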
1205
1206#if CV_SIMD128_64F
1207inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
1208{
1209 v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
1210 return v_sqrt(x);
1211}
1212
1213inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
1214{
1215 return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
1216}
1217
1218inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1219{
1220 return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
1221}
1222
1223inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1224{
1225 return v_fma(a, b, c);
1226}
1227#endif
1228
1229// trade efficiency for convenience
1230#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
1231inline _Tpvec v_shl (const _Tpvec& a, int n) \
1232{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
1233inline _Tpvec v_shr (const _Tpvec& a, int n) \
1234{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
1235template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1236{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
1237template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1238{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
1239template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1240{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
1241
1242OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
1243OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
1244OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
1245OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
1246OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
1247OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
1248OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
1249OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
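// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): immediate-count shifts; v_shr truncates while v_rshr
// rounds to nearest before shifting.
inline v_uint16x8 example_divide_by_16(const v_uint16x8& a)
{
    return v_rshr<4>(a); // (a + 8) >> 4 per lane
}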
1250
1251#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
1252template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1253{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
1254template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1255{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \
1256template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1257{ return a; } \
1258template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1259{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
1260template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1261{ return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \
1262template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1263{ CV_UNUSED(b); return a; }
1264
1265OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
1266OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
1267OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
1268OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
1269OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
1270OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
1271OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
1272OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
1273OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
1274#if CV_SIMD128_64F
1275OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
1276#endif
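// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): the two-argument rotate shifts lanes out of 'cur' and
// lanes of 'next' in, which is handy for sliding-window filters.
inline v_uint8x16 example_slide_one_byte(const v_uint8x16& cur, const v_uint8x16& next)
{
    return v_rotate_right<1>(cur, next); // cur[1..15] followed by next[0]
}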
1277
1278#if defined(__clang__) && defined(__aarch64__)
1279// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
1280#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
1281inline _Tpvec v_load_low(const _Tp* ptr) \
1282{ \
1283typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
1284uint64 v = *(unaligned_uint64*)ptr; \
1285return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
1286}
1287#else
1288#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
1289inline _Tpvec v_load_low(const _Tp* ptr) \
1290{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
1291#endif
1292
1293#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
1294inline _Tpvec v_load(const _Tp* ptr) \
1295{ return _Tpvec(vld1q_##suffix(ptr)); } \
1296inline _Tpvec v_load_aligned(const _Tp* ptr) \
1297{ return _Tpvec(vld1q_##suffix(ptr)); } \
1298OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
1299inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1300{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
1301inline void v_store(_Tp* ptr, const _Tpvec& a) \
1302{ vst1q_##suffix(ptr, a.val); } \
1303inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1304{ vst1q_##suffix(ptr, a.val); } \
1305inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1306{ vst1q_##suffix(ptr, a.val); } \
1307inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1308{ vst1q_##suffix(ptr, a.val); } \
1309inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1310{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
1311inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1312{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
1313
1314OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
1315OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
1316OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
1317OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
1318OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
1319OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
1320OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
1321OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
1322OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
1323#if CV_SIMD128_64F
1324OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
1325#endif
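// Usage sketch (illustrative only, not part of the original header; the helper
// name is hypothetical): the canonical load/compute/store loop, processing four
// floats per iteration and finishing the remainder in scalar code.
inline void example_add_arrays(const float* a, const float* b, float* dst, int n)
{
    int i = 0;
    for (; i + 4 <= n; i += 4)
    {
        v_float32x4 va = v_load(a + i), vb = v_load(b + i);
        v_store(dst + i, v_add(va, vb));
    }
    for (; i < n; ++i)
        dst[i] = a[i] + b[i];
}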
1326
1327inline unsigned v_reduce_sum(const v_uint8x16& a)
1328{
1329#if CV_NEON_AARCH64
1330 uint16_t t0 = vaddlvq_u8(a.val);
1331 return t0;
1332#else // #if CV_NEON_AARCH64
1333 uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
1334 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1335 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1336#endif // #if CV_NEON_AARCH64
1337}
1338inline int v_reduce_sum(const v_int8x16& a)
1339{
1340#if CV_NEON_AARCH64
1341 int16_t t0 = vaddlvq_s8(a.val);
1342 return t0;
1343#else // #if CV_NEON_AARCH64
1344 int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
1345 int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
1346 return vget_lane_s32(vpadd_s32(t1, t1), 0);
1347#endif // #if CV_NEON_AARCH64
1348}
1349inline unsigned v_reduce_sum(const v_uint16x8& a)
1350{
1351#if CV_NEON_AARCH64
1352 uint32_t t0 = vaddlvq_u16(a.val);
1353 return t0;
1354#else // #if CV_NEON_AARCH64
1355 uint32x4_t t0 = vpaddlq_u16(a.val);
1356 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1357 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1358#endif // #if CV_NEON_AARCH64
1359}
1360inline int v_reduce_sum(const v_int16x8& a)
1361{
1362#if CV_NEON_AARCH64
1363 int32_t t0 = vaddlvq_s16(a.val);
1364 return t0;
1365#else // #if CV_NEON_AARCH64
1366 int32x4_t t0 = vpaddlq_s16(a.val);
1367 int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
1368 return vget_lane_s32(vpadd_s32(t1, t1), 0);
1369#endif // #if CV_NEON_AARCH64
1370}
1371
1372#if CV_NEON_AARCH64
1373#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1374inline scalartype v_reduce_##func(const _Tpvec& a) \
1375{ \
1376 return v##vectorfunc##vq_##suffix(a.val); \
1377}
1378#else // #if CV_NEON_AARCH64
1379#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1380inline scalartype v_reduce_##func(const _Tpvec& a) \
1381{ \
1382 _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1383 a0 = vp##vectorfunc##_##suffix(a0, a0); \
1384 a0 = vp##vectorfunc##_##suffix(a0, a0); \
1385 return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
1386}
1387#endif // #if CV_NEON_AARCH64
1388
1389OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
1390OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
1391OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
1392OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
1393
1394#if CV_NEON_AARCH64
1395#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1396inline scalartype v_reduce_##func(const _Tpvec& a) \
1397{ \
1398 return v##vectorfunc##vq_##suffix(a.val); \
1399}
1400#else // #if CV_NEON_AARCH64
1401#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1402inline scalartype v_reduce_##func(const _Tpvec& a) \
1403{ \
1404 _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1405 a0 = vp##vectorfunc##_##suffix(a0, a0); \
1406 return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
1407}
1408#endif // #if CV_NEON_AARCH64
1409
1410OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
1411OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
1412OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
1413OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
1414
1415#if CV_NEON_AARCH64
1416#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1417inline scalartype v_reduce_##func(const _Tpvec& a) \
1418{ \
1419 return v##vectorfunc##vq_##suffix(a.val); \
1420}
1421#else // #if CV_NEON_AARCH64
1422#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
1423inline scalartype v_reduce_##func(const _Tpvec& a) \
1424{ \
1425 _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
1426 return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
1427}
1428#endif // #if CV_NEON_AARCH64
1429
1430OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
1431OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
1432OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
1433OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
1434OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
1435OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
1436OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
1437OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
1438OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
1439
1440inline uint64 v_reduce_sum(const v_uint64x2& a)
1441{
1442#if CV_NEON_AARCH64
1443 return vaddvq_u64(a.val);
1444#else // #if CV_NEON_AARCH64
1445 return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);
1446#endif // #if CV_NEON_AARCH64
1447}
1448inline int64 v_reduce_sum(const v_int64x2& a)
1449{
1450#if CV_NEON_AARCH64
1451 return vaddvq_s64(a.val);
1452#else // #if CV_NEON_AARCH64
1453 return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);
1454#endif // #if CV_NEON_AARCH64
1455}
1456#if CV_SIMD128_64F
1457inline double v_reduce_sum(const v_float64x2& a)
1458{
1459 return vaddvq_f64(a.val);
1460}
1461#endif
1462
1463inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1464 const v_float32x4& c, const v_float32x4& d)
1465{
1466#if CV_NEON_AARCH64
1467 float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
1468 float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
1469 return v_float32x4(vpaddq_f32(ab, cd)); // sumA sumB sumC sumD
1470#else // #if CV_NEON_AARCH64
1471 float32x4x2_t ab = vtrnq_f32(a.val, b.val);
1472 float32x4x2_t cd = vtrnq_f32(c.val, d.val);
1473
1474 float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
1475 float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
1476
1477 float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
1478 float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
1479
1480 return v_float32x4(vaddq_f32(v0, v1));
1481#endif // #if CV_NEON_AARCH64
1482}
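// Illustrative result: with a = (1,2,3,4) and b = (5,6,7,8), the first two output lanes
// are 10 and 26; lanes 2 and 3 hold the sums of c and d respectively.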
1483
1484inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1485{
1486#if CV_NEON_AARCH64
1487 uint8x16_t t0 = vabdq_u8(a.val, b.val);
1488 uint16_t t1 = vaddlvq_u8(t0);
1489 return t1;
1490#else // #if CV_NEON_AARCH64
1491 uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
1492 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1493 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1494#endif // #if CV_NEON_AARCH64
1495}
1496inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1497{
1498#if CV_NEON_AARCH64
1499 uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
1500 uint16_t t1 = vaddlvq_u8(t0);
1501 return t1;
1502#else // #if CV_NEON_AARCH64
1503 uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
1504 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1505 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1506#endif // #if CV_NEON_AARCH64
1507}
1508inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1509{
1510#if CV_NEON_AARCH64
1511 uint16x8_t t0 = vabdq_u16(a.val, b.val);
1512 uint32_t t1 = vaddlvq_u16(t0);
1513 return t1;
1514#else // #if CV_NEON_AARCH64
1515 uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
1516 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1517 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1518#endif // #if CV_NEON_AARCH64
1519}
1520inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1521{
1522#if CV_NEON_AARCH64
1523 uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
1524 uint32_t t1 = vaddlvq_u16(t0);
1525 return t1;
1526#else // #if CV_NEON_AARCH64
1527 uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
1528 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1529 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1530#endif // #if CV_NEON_AARCH64
1531}
1532inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1533{
1534#if CV_NEON_AARCH64
1535 uint32x4_t t0 = vabdq_u32(a.val, b.val);
1536 uint32_t t1 = vaddvq_u32(t0);
1537 return t1;
1538#else // #if CV_NEON_AARCH64
1539 uint32x4_t t0 = vabdq_u32(a.val, b.val);
1540 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1541 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1542#endif // #if CV_NEON_AARCH64
1543}
1544inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1545{
1546#if CV_NEON_AARCH64
1547 uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
1548 uint32_t t1 = vaddvq_u32(t0);
1549 return t1;
1550#else // #if CV_NEON_AARCH64
1551 uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
1552 uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
1553 return vget_lane_u32(vpadd_u32(t1, t1), 0);
1554#endif // #if CV_NEON_AARCH64
1555}
1556inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1557{
1558#if CV_NEON_AARCH64
1559 float32x4_t t0 = vabdq_f32(a.val, b.val);
1560 return vaddvq_f32(t0);
1561#else // #if CV_NEON_AARCH64
1562 float32x4_t t0 = vabdq_f32(a.val, b.val);
1563 float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
1564 return vget_lane_f32(vpadd_f32(t1, t1), 0);
1565#endif // #if CV_NEON_AARCH64
1566}
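// v_reduce_sad returns the scalar sum of absolute differences, sum(|a[i] - b[i]|);
// e.g. for two v_uint8x16 vectors filled with 5 and 3 the result is 16 * 2 == 32.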
1567
1568inline v_uint8x16 v_popcount(const v_uint8x16& a)
1569{ return v_uint8x16(vcntq_u8(a.val)); }
1570inline v_uint8x16 v_popcount(const v_int8x16& a)
1571{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
1572inline v_uint16x8 v_popcount(const v_uint16x8& a)
1573{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
1574inline v_uint16x8 v_popcount(const v_int16x8& a)
1575{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
1576inline v_uint32x4 v_popcount(const v_uint32x4& a)
1577{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
1578inline v_uint32x4 v_popcount(const v_int32x4& a)
1579{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
1580inline v_uint64x2 v_popcount(const v_uint64x2& a)
1581{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
1582inline v_uint64x2 v_popcount(const v_int64x2& a)
1583{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
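// v_popcount counts set bits per lane: vcntq_u8 gives per-byte counts, and for the wider
// types pairwise adds combine them so each output lane holds the popcount of its input lane.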
1584
1585inline int v_signmask(const v_uint8x16& a)
1586{
1587#if CV_NEON_AARCH64
1588 const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
1589 const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
1590 uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
1591 uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
1592 uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));
1593 return t0;
1594#else // #if CV_NEON_AARCH64
1595 int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
1596 uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
1597 uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
1598 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
1599#endif // #if CV_NEON_AARCH64
1600}
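// v_signmask packs the most significant bit of every lane into an integer, lane i -> bit i;
// e.g. a mask vector whose lanes 0 and 3 have their MSB set yields 0b1001 == 9.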
1601
1602inline int v_signmask(const v_int8x16& a)
1603{ return v_signmask(v_reinterpret_as_u8(a)); }
1604
1605inline int v_signmask(const v_uint16x8& a)
1606{
1607#if CV_NEON_AARCH64
1608 const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
1609 uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
1610 uint32_t t0 = vaddlvq_u16(v0);
1611 return t0;
1612#else // #if CV_NEON_AARCH64
1613 int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
1614 uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
1615 uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
1616 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
1617#endif // #if CV_NEON_AARCH64
1618}
1619inline int v_signmask(const v_int16x8& a)
1620{ return v_signmask(v_reinterpret_as_u16(a)); }
1621
1622inline int v_signmask(const v_uint32x4& a)
1623{
1624#if CV_NEON_AARCH64
1625 const int32x4_t signPosition = {0,1,2,3};
1626 uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
1627 uint32_t t0 = vaddvq_u32(v0);
1628 return t0;
1629#else // #if CV_NEON_AARCH64
1630 int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
1631 uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
1632 uint64x2_t v1 = vpaddlq_u32(v0);
1633 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
1634#endif // #if CV_NEON_AARCH64
1635}
1636inline int v_signmask(const v_int32x4& a)
1637{ return v_signmask(v_reinterpret_as_u32(a)); }
1638inline int v_signmask(const v_float32x4& a)
1639{ return v_signmask(v_reinterpret_as_u32(a)); }
1640inline int v_signmask(const v_uint64x2& a)
1641{
1642#if CV_NEON_AARCH64
1643 const int64x2_t signPosition = {0,1};
1644 uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
1645 int t0 = (int)vaddvq_u64(v0);
1646 return t0;
1647#else // #if CV_NEON_AARCH64
1648 int64x1_t m0 = vdup_n_s64(0);
1649 uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
1650 return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
1651#endif // #if CV_NEON_AARCH64
1652}
1653inline int v_signmask(const v_int64x2& a)
1654{ return v_signmask(v_reinterpret_as_u64(a)); }
1655#if CV_SIMD128_64F
1656inline int v_signmask(const v_float64x2& a)
1657{ return v_signmask(v_reinterpret_as_u64(a)); }
1658#endif
1659
1660inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
1661inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
1662inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
1663inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
1664inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
1665inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
1666inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
1667inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
1668inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1669#if CV_SIMD128_64F
1670inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
1671#endif
1672
1673#if CV_NEON_AARCH64
1674 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1675 inline bool v_check_all(const v_##_Tpvec& a) \
1676 { \
1677 return (vminvq_##suffix(a.val) >> shift) != 0; \
1678 } \
1679 inline bool v_check_any(const v_##_Tpvec& a) \
1680 { \
1681 return (vmaxvq_##suffix(a.val) >> shift) != 0; \
1682 }
1683#else // #if CV_NEON_AARCH64
1684 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
1685 inline bool v_check_all(const v_##_Tpvec& a) \
1686 { \
1687 _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
1688 uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1689 return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
1690 } \
1691 inline bool v_check_any(const v_##_Tpvec& a) \
1692 { \
1693 _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
1694 uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
1695 return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
1696 }
1697#endif // #if CV_NEON_AARCH64
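// v_check_all / v_check_any treat the MSB of each lane as a boolean: v_check_all is true
// only if every lane has its MSB set, v_check_any if at least one lane does.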
1698
1699OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
1700OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
1701OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
1702
1703inline bool v_check_all(const v_uint64x2& a)
1704{
1705 uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1706 return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;
1707}
1708inline bool v_check_any(const v_uint64x2& a)
1709{
1710 uint64x2_t v0 = vshrq_n_u64(a.val, 63);
1711 return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
1712}
1713
1714inline bool v_check_all(const v_int8x16& a)
1715{ return v_check_all(v_reinterpret_as_u8(a)); }
1716inline bool v_check_all(const v_int16x8& a)
1717{ return v_check_all(v_reinterpret_as_u16(a)); }
1718inline bool v_check_all(const v_int32x4& a)
1719{ return v_check_all(v_reinterpret_as_u32(a)); }
1720inline bool v_check_all(const v_float32x4& a)
1721{ return v_check_all(v_reinterpret_as_u32(a)); }
1722
1723inline bool v_check_any(const v_int8x16& a)
1724{ return v_check_any(v_reinterpret_as_u8(a)); }
1725inline bool v_check_any(const v_int16x8& a)
1726{ return v_check_any(v_reinterpret_as_u16(a)); }
1727inline bool v_check_any(const v_int32x4& a)
1728{ return v_check_any(v_reinterpret_as_u32(a)); }
1729inline bool v_check_any(const v_float32x4& a)
1730{ return v_check_any(v_reinterpret_as_u32(a)); }
1731
1732inline bool v_check_all(const v_int64x2& a)
1733{ return v_check_all(v_reinterpret_as_u64(a)); }
1734inline bool v_check_any(const v_int64x2& a)
1735{ return v_check_any(v_reinterpret_as_u64(a)); }
1736#if CV_SIMD128_64F
1737inline bool v_check_all(const v_float64x2& a)
1738{ return v_check_all(v_reinterpret_as_u64(a)); }
1739inline bool v_check_any(const v_float64x2& a)
1740{ return v_check_any(v_reinterpret_as_u64(a)); }
1741#endif
1742
1743#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
1744inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1745{ \
1746 return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
1747}
1748
1749OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
1750OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
1751OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
1752OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
1753OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
1754OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
1755OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
1756#if CV_SIMD128_64F
1757OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
1758#endif
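// v_select is a per-lane blend: where a mask lane is all ones the lane of `a` is taken,
// where it is all zeros the lane of `b`; vbslq selects bitwise, so the mask is expected
// to come from a comparison.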
1759
1760#if CV_NEON_AARCH64
1761#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1762inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1763{ \
1764 b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1765 b1.val = vmovl_high_##suffix(a.val); \
1766} \
1767inline _Tpwvec v_expand_low(const _Tpvec& a) \
1768{ \
1769 return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
1770} \
1771inline _Tpwvec v_expand_high(const _Tpvec& a) \
1772{ \
1773 return _Tpwvec(vmovl_high_##suffix(a.val)); \
1774} \
1775inline _Tpwvec v_load_expand(const _Tp* ptr) \
1776{ \
1777 return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1778}
1779#else
1780#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
1781inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1782{ \
1783 b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
1784 b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
1785} \
1786inline _Tpwvec v_expand_low(const _Tpvec& a) \
1787{ \
1788 return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
1789} \
1790inline _Tpwvec v_expand_high(const _Tpvec& a) \
1791{ \
1792 return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
1793} \
1794inline _Tpwvec v_load_expand(const _Tp* ptr) \
1795{ \
1796 return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
1797}
1798#endif
1799
1800OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
1801OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
1802OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
1803OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
1804OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
1805OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
1806
1807inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1808{
1809 typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1810 uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
1811 uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
1812 return v_uint32x4(vmovl_u16(v1));
1813}
1814
1815inline v_int32x4 v_load_expand_q(const schar* ptr)
1816{
1817 typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
1818 int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
1819 int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
1820 return v_int32x4(vmovl_s16(v1));
1821}
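// v_load_expand_q reads four 8-bit values and widens each to 32 bits,
// e.g. the bytes {1, 2, 3, 4} load as the vector (1, 2, 3, 4).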
1822
1823#if defined(__aarch64__) || defined(_M_ARM64)
1824#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1825inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1826{ \
1827 b0.val = vzip1q_##suffix(a0.val, a1.val); \
1828 b1.val = vzip2q_##suffix(a0.val, a1.val); \
1829} \
1830inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1831{ \
1832 return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1833} \
1834inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1835{ \
1836 return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1837} \
1838inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1839{ \
1840 c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1841 d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1842}
1843#else
1844#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
1845inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
1846{ \
1847 _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
1848 b0.val = p.val[0]; \
1849 b1.val = p.val[1]; \
1850} \
1851inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1852{ \
1853 return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
1854} \
1855inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1856{ \
1857 return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
1858} \
1859inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
1860{ \
1861 c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
1862 d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
1863}
1864#endif
1865
1866OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
1867OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
1868OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
1869OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
1870OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
1871OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
1872OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
1873#if CV_SIMD128_64F
1874OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
1875#endif
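// v_zip interleaves two vectors lane by lane: for a0 = (x0,x1,x2,x3) and a1 = (y0,y1,y2,y3)
// it produces b0 = (x0,y0,x1,y1) and b1 = (x2,y2,x3,y3).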
1876
1877inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1878{
1879 uint8x16_t vec = vrev64q_u8(a.val);
1880 return v_uint8x16(vextq_u8(vec, vec, 8));
1881}
1882
1883inline v_int8x16 v_reverse(const v_int8x16 &a)
1884{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1885
1886inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1887{
1888 uint16x8_t vec = vrev64q_u16(a.val);
1889 return v_uint16x8(vextq_u16(vec, vec, 4));
1890}
1891
1892inline v_int16x8 v_reverse(const v_int16x8 &a)
1893{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1894
1895inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1896{
1897 uint32x4_t vec = vrev64q_u32(a.val);
1898 return v_uint32x4(vextq_u32(vec, vec, 2));
1899}
1900
1901inline v_int32x4 v_reverse(const v_int32x4 &a)
1902{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1903
1904inline v_float32x4 v_reverse(const v_float32x4 &a)
1905{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1906
1907inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1908{
1909 uint64x2_t vec = a.val;
1910 uint64x1_t vec_lo = vget_low_u64(vec);
1911 uint64x1_t vec_hi = vget_high_u64(vec);
1912 return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
1913}
1914
1915inline v_int64x2 v_reverse(const v_int64x2 &a)
1916{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1917
1918#if CV_SIMD128_64F
1919inline v_float64x2 v_reverse(const v_float64x2 &a)
1920{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1921#endif
1922
1923#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
1924template <int s> \
1925inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
1926{ \
1927 return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
1928}
1929
1930OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
1931OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
1932OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
1933OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
1934OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
1935OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
1936OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
1937OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
1938OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
1939#if CV_SIMD128_64F
1940OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
1941#endif
1942
1943#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
1944template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
1945
1946OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
1947OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
1948OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
1949OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
1950OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
1951OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
1952OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
1953OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
1954OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
1955#if CV_SIMD128_64F
1956OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
1957#endif
1958
1959#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
1960template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }
1961
1962OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
1963OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
1964OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
1965OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
1966OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
1967OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
1968OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
1969OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
1970OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
1971#if CV_SIMD128_64F
1972OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
1973#endif
1974
1975#if CV_SIMD128_64F
1976inline v_int32x4 v_round(const v_float32x4& a)
1977{
1978 float32x4_t a_ = a.val;
1979 int32x4_t result;
1980#if defined _MSC_VER
1981 result = vcvtnq_s32_f32(a_);
1982#else
1983 __asm__ ("fcvtns %0.4s, %1.4s"
1984 : "=w"(result)
1985 : "w"(a_)
1986 : /* No clobbers */);
1987#endif
1988 return v_int32x4(result);
1989}
1990#else
1991inline v_int32x4 v_round(const v_float32x4& a)
1992{
1993 // See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
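    // 12582912.0f is 1.5 * 2^23: adding it and then subtracting it rounds the value to the
    // nearest integer under the default round-to-nearest mode (exact for |a| < 2^22).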
1994 float32x4_t delta = vdupq_n_f32(12582912.0f);
1995 return v_int32x4(vcvtq_s32_f32(vsubq_f32(vaddq_f32(a.val, delta), delta)));
1996}
1997#endif
1998inline v_int32x4 v_floor(const v_float32x4& a)
1999{
2000 int32x4_t a1 = vcvtq_s32_f32(a.val);
2001 uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
2002 return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
2003}
2004
2005inline v_int32x4 v_ceil(const v_float32x4& a)
2006{
2007 int32x4_t a1 = vcvtq_s32_f32(a.val);
2008 uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
2009 return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
2010}
2011
2012inline v_int32x4 v_trunc(const v_float32x4& a)
2013{ return v_int32x4(vcvtq_s32_f32(a.val)); }
2014
2015#if CV_SIMD128_64F
2016inline v_int32x4 v_round(const v_float64x2& a)
2017{
2018 static const int32x2_t zero = vdup_n_s32(0);
2019 return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero));
2020}
2021
2022inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2023{
2024 return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val))));
2025}
2026
2027inline v_int32x4 v_floor(const v_float64x2& a)
2028{
2029 static const int32x2_t zero = vdup_n_s32(0);
2030 int64x2_t a1 = vcvtq_s64_f64(a.val);
2031 uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
2032 a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
2033 return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
2034}
2035
2036inline v_int32x4 v_ceil(const v_float64x2& a)
2037{
2038 static const int32x2_t zero = vdup_n_s32(0);
2039 int64x2_t a1 = vcvtq_s64_f64(a.val);
2040 uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
2041 a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
2042 return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));
2043}
2044
2045inline v_int32x4 v_trunc(const v_float64x2& a)
2046{
2047 static const int32x2_t zero = vdup_n_s32(0);
2048 return v_int32x4(vcombine_s32(vmovn_s64(vcvtq_s64_f64(a.val)), zero)); // FCVTZS: truncate toward zero
2049}
2050#endif
2051
2052#if CV_NEON_AARCH64
2053#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
2054inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2055 const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2056 v_##_Tpvec& b0, v_##_Tpvec& b1, \
2057 v_##_Tpvec& b2, v_##_Tpvec& b3) \
2058{ \
2059 /* -- Pass 1: 64b transpose */ \
2060 _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
2061 vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
2062 vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
2063 _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
2064 vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
2065 vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
2066 _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
2067 vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
2068 vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
2069 _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
2070 vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
2071 vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
2072 /* -- Pass 2: 32b transpose */ \
2073 b0.val = vtrn1q_##suffix##32(t0, t1); \
2074 b1.val = vtrn2q_##suffix##32(t0, t1); \
2075 b2.val = vtrn1q_##suffix##32(t2, t3); \
2076 b3.val = vtrn2q_##suffix##32(t2, t3); \
2077}
2078
2079OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
2080OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
2081OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
2082#else // #if CV_NEON_AARCH64
2083#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
2084inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2085 const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2086 v_##_Tpvec& b0, v_##_Tpvec& b1, \
2087 v_##_Tpvec& b2, v_##_Tpvec& b3) \
2088{ \
2089 /* m00 m01 m02 m03 */ \
2090 /* m10 m11 m12 m13 */ \
2091 /* m20 m21 m22 m23 */ \
2092 /* m30 m31 m32 m33 */ \
2093 _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
2094 _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
2095 /* m00 m10 m02 m12 */ \
2096 /* m01 m11 m03 m13 */ \
2097 /* m20 m30 m22 m32 */ \
2098 /* m21 m31 m23 m33 */ \
2099 b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
2100 b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
2101 b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
2102 b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
2103}
2104
2105OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
2106OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
2107OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
2108#endif // #if CV_NEON_AARCH64
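// v_transpose4x4 treats a0..a3 as the rows of a 4x4 matrix and writes its transpose,
// so lane j of output b_i equals lane i of input a_j.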
2109
2110#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
2111inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
2112{ \
2113 _Tpvec##x2_t v = vld2q_##suffix(ptr); \
2114 a.val = v.val[0]; \
2115 b.val = v.val[1]; \
2116} \
2117inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
2118{ \
2119 _Tpvec##x3_t v = vld3q_##suffix(ptr); \
2120 a.val = v.val[0]; \
2121 b.val = v.val[1]; \
2122 c.val = v.val[2]; \
2123} \
2124inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
2125 v_##_Tpvec& c, v_##_Tpvec& d) \
2126{ \
2127 _Tpvec##x4_t v = vld4q_##suffix(ptr); \
2128 a.val = v.val[0]; \
2129 b.val = v.val[1]; \
2130 c.val = v.val[2]; \
2131 d.val = v.val[3]; \
2132} \
2133inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2134 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2135{ \
2136 _Tpvec##x2_t v; \
2137 v.val[0] = a.val; \
2138 v.val[1] = b.val; \
2139 vst2q_##suffix(ptr, v); \
2140} \
2141inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2142 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2143{ \
2144 _Tpvec##x3_t v; \
2145 v.val[0] = a.val; \
2146 v.val[1] = b.val; \
2147 v.val[2] = c.val; \
2148 vst3q_##suffix(ptr, v); \
2149} \
2150inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2151 const v_##_Tpvec& c, const v_##_Tpvec& d, \
2152 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2153{ \
2154 _Tpvec##x4_t v; \
2155 v.val[0] = a.val; \
2156 v.val[1] = b.val; \
2157 v.val[2] = c.val; \
2158 v.val[3] = d.val; \
2159 vst4q_##suffix(ptr, v); \
2160}
2161
2162#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
2163inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
2164{ \
2165 tp##x1_t a0 = vld1_##suffix(ptr); \
2166 tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2167 tp##x1_t a1 = vld1_##suffix(ptr + 2); \
2168 tp##x1_t b1 = vld1_##suffix(ptr + 3); \
2169 a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2170 b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2171} \
2172 \
2173inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
2174 v_##tp##x2& b, v_##tp##x2& c ) \
2175{ \
2176 tp##x1_t a0 = vld1_##suffix(ptr); \
2177 tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2178 tp##x1_t c0 = vld1_##suffix(ptr + 2); \
2179 tp##x1_t a1 = vld1_##suffix(ptr + 3); \
2180 tp##x1_t b1 = vld1_##suffix(ptr + 4); \
2181 tp##x1_t c1 = vld1_##suffix(ptr + 5); \
2182 a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2183 b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2184 c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
2185} \
2186 \
2187inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
2188 v_##tp##x2& c, v_##tp##x2& d ) \
2189{ \
2190 tp##x1_t a0 = vld1_##suffix(ptr); \
2191 tp##x1_t b0 = vld1_##suffix(ptr + 1); \
2192 tp##x1_t c0 = vld1_##suffix(ptr + 2); \
2193 tp##x1_t d0 = vld1_##suffix(ptr + 3); \
2194 tp##x1_t a1 = vld1_##suffix(ptr + 4); \
2195 tp##x1_t b1 = vld1_##suffix(ptr + 5); \
2196 tp##x1_t c1 = vld1_##suffix(ptr + 6); \
2197 tp##x1_t d1 = vld1_##suffix(ptr + 7); \
2198 a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
2199 b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
2200 c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
2201 d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
2202} \
2203 \
2204inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
2205 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2206{ \
2207 vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2208 vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2209 vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
2210 vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
2211} \
2212 \
2213inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
2214 const v_##tp##x2& b, const v_##tp##x2& c, \
2215 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2216{ \
2217 vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2218 vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2219 vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
2220 vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
2221 vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
2222 vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
2223} \
2224 \
2225inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
2226 const v_##tp##x2& c, const v_##tp##x2& d, \
2227 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2228{ \
2229 vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
2230 vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
2231 vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
2232 vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
2233 vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
2234 vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
2235 vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
2236 vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
2237}
2238
2239OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
2240OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
2241OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
2242OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
2243OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
2244OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
2245OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
2246#if CV_SIMD128_64F
2247OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
2248#endif
2249
2250OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
2251OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
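// v_load_deinterleave / v_store_interleave convert between interleaved memory
// (e.g. R0,G0,B0,R1,G1,B1,...) and separate planar vectors using vld2q/vld3q/vld4q and
// their store counterparts; for the 64-bit types the same effect is built from vld1/vst1
// of the vector halves, as in the macro above.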
2252
2253inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2254{
2255 return v_float32x4(vcvtq_f32_s32(a.val));
2256}
2257
2258#if CV_SIMD128_64F
2259inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2260{
2261 float32x2_t zero = vdup_n_f32(0.0f);
2262 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
2263}
2264
2265inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2266{
2267 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
2268}
2269
2270inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2271{
2272 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
2273}
2274
2275inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2276{
2277 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
2278}
2279
2280inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2281{
2282 return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
2283}
2284
2285inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2286{
2287 return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
2288}
2289
2290inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2291{ return v_float64x2(vcvtq_f64_s64(a.val)); }
2292
2293#endif
2294
2296
2297inline v_int8x16 v_lut(const schar* tab, const int* idx)
2298{
2299 schar CV_DECL_ALIGNED(32) elems[16] =
2300 {
2301 tab[idx[ 0]],
2302 tab[idx[ 1]],
2303 tab[idx[ 2]],
2304 tab[idx[ 3]],
2305 tab[idx[ 4]],
2306 tab[idx[ 5]],
2307 tab[idx[ 6]],
2308 tab[idx[ 7]],
2309 tab[idx[ 8]],
2310 tab[idx[ 9]],
2311 tab[idx[10]],
2312 tab[idx[11]],
2313 tab[idx[12]],
2314 tab[idx[13]],
2315 tab[idx[14]],
2316 tab[idx[15]]
2317 };
2318 return v_int8x16(vld1q_s8(elems));
2319}
2320inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
2321{
2322 schar CV_DECL_ALIGNED(32) elems[16] =
2323 {
2324 tab[idx[0]],
2325 tab[idx[0] + 1],
2326 tab[idx[1]],
2327 tab[idx[1] + 1],
2328 tab[idx[2]],
2329 tab[idx[2] + 1],
2330 tab[idx[3]],
2331 tab[idx[3] + 1],
2332 tab[idx[4]],
2333 tab[idx[4] + 1],
2334 tab[idx[5]],
2335 tab[idx[5] + 1],
2336 tab[idx[6]],
2337 tab[idx[6] + 1],
2338 tab[idx[7]],
2339 tab[idx[7] + 1]
2340 };
2341 return v_int8x16(vld1q_s8(elems));
2342}
2343inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2344{
2345 schar CV_DECL_ALIGNED(32) elems[16] =
2346 {
2347 tab[idx[0]],
2348 tab[idx[0] + 1],
2349 tab[idx[0] + 2],
2350 tab[idx[0] + 3],
2351 tab[idx[1]],
2352 tab[idx[1] + 1],
2353 tab[idx[1] + 2],
2354 tab[idx[1] + 3],
2355 tab[idx[2]],
2356 tab[idx[2] + 1],
2357 tab[idx[2] + 2],
2358 tab[idx[2] + 3],
2359 tab[idx[3]],
2360 tab[idx[3] + 1],
2361 tab[idx[3] + 2],
2362 tab[idx[3] + 3]
2363 };
2364 return v_int8x16(vld1q_s8(elems));
2365}
2366inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
2367inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
2368inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
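// v_lut gathers tab[idx[i]] into lane i; v_lut_pairs and v_lut_quads load two or four
// consecutive table elements per index, starting at tab[idx[i]].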
2369
2370inline v_int16x8 v_lut(const short* tab, const int* idx)
2371{
2372 short CV_DECL_ALIGNED(32) elems[8] =
2373 {
2374 tab[idx[0]],
2375 tab[idx[1]],
2376 tab[idx[2]],
2377 tab[idx[3]],
2378 tab[idx[4]],
2379 tab[idx[5]],
2380 tab[idx[6]],
2381 tab[idx[7]]
2382 };
2383 return v_int16x8(vld1q_s16(elems));
2384}
2385inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2386{
2387 short CV_DECL_ALIGNED(32) elems[8] =
2388 {
2389 tab[idx[0]],
2390 tab[idx[0] + 1],
2391 tab[idx[1]],
2392 tab[idx[1] + 1],
2393 tab[idx[2]],
2394 tab[idx[2] + 1],
2395 tab[idx[3]],
2396 tab[idx[3] + 1]
2397 };
2398 return v_int16x8(vld1q_s16(elems));
2399}
2400inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
2401{
2402 return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
2403}
2404inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
2405inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
2406inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
2407
2408inline v_int32x4 v_lut(const int* tab, const int* idx)
2409{
2410 int CV_DECL_ALIGNED(32) elems[4] =
2411 {
2412 tab[idx[0]],
2413 tab[idx[1]],
2414 tab[idx[2]],
2415 tab[idx[3]]
2416 };
2417 return v_int32x4(vld1q_s32(elems));
2418}
2419inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
2420{
2421 return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
2422}
2423inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
2424{
2425 return v_int32x4(vld1q_s32(tab + idx[0]));
2426}
2427inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
2428inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
2429inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
2430
2431inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
2432{
2433 return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
2434}
2435inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
2436{
2437 return v_int64x2(vld1q_s64(tab + idx[0]));
2438}
2439inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
2440inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2441
2442inline v_float32x4 v_lut(const float* tab, const int* idx)
2443{
2444 float CV_DECL_ALIGNED(32) elems[4] =
2445 {
2446 tab[idx[0]],
2447 tab[idx[1]],
2448 tab[idx[2]],
2449 tab[idx[3]]
2450 };
2451 return v_float32x4(vld1q_f32(elems));
2452}
2453inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
2454{
2455 typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64;
2456
2457 uint64 CV_DECL_ALIGNED(32) elems[2] =
2458 {
2459 *(unaligned_uint64*)(tab + idx[0]),
2460 *(unaligned_uint64*)(tab + idx[1])
2461 };
2462 return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
2463}
2464inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
2465{
2466 return v_float32x4(vld1q_f32(tab + idx[0]));
2467}
2468
2469inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2470{
2471 int CV_DECL_ALIGNED(32) elems[4] =
2472 {
2473 tab[vgetq_lane_s32(idxvec.val, 0)],
2474 tab[vgetq_lane_s32(idxvec.val, 1)],
2475 tab[vgetq_lane_s32(idxvec.val, 2)],
2476 tab[vgetq_lane_s32(idxvec.val, 3)]
2477 };
2478 return v_int32x4(vld1q_s32(elems));
2479}
2480
2481inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2482{
2483 unsigned CV_DECL_ALIGNED(32) elems[4] =
2484 {
2485 tab[vgetq_lane_s32(idxvec.val, 0)],
2486 tab[vgetq_lane_s32(idxvec.val, 1)],
2487 tab[vgetq_lane_s32(idxvec.val, 2)],
2488 tab[vgetq_lane_s32(idxvec.val, 3)]
2489 };
2490 return v_uint32x4(vld1q_u32(elems));
2491}
2492
2493inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2494{
2495 float CV_DECL_ALIGNED(32) elems[4] =
2496 {
2497 tab[vgetq_lane_s32(idxvec.val, 0)],
2498 tab[vgetq_lane_s32(idxvec.val, 1)],
2499 tab[vgetq_lane_s32(idxvec.val, 2)],
2500 tab[vgetq_lane_s32(idxvec.val, 3)]
2501 };
2502 return v_float32x4(vld1q_f32(elems));
2503}
2504
2505inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2506{
2507 /*int CV_DECL_ALIGNED(32) idx[4];
2508 v_store(idx, idxvec);
2509
2510 float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
2511 float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
2512
2513 float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
2514 x = v_float32x4(xxyy.val[0]);
2515 y = v_float32x4(xxyy.val[1]);*/
2516 int CV_DECL_ALIGNED(32) idx[4];
2517 v_store_aligned(idx, idxvec);
2518
2519 x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
2520 y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
2521}
2522
2523inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2524{
2525 return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
2526}
2527inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
2528inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2529{
2530 return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
2531}
2532inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
2533
2534inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2535{
2536 return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
2537}
2538inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2539inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2540{
2541 int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
2542 return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
2543}
2544inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2545
2546inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2547{
2548 int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
2549 return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
2550}
2551inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2552inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2553
2554inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2555{
2556 return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
2557}
2558inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2559
2560inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2561{
2562 return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
2563}
2564inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2565
2566inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2567inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2568inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2569
2570#if CV_SIMD128_64F
2571inline v_float64x2 v_lut(const double* tab, const int* idx)
2572{
2573 double CV_DECL_ALIGNED(32) elems[2] =
2574 {
2575 tab[idx[0]],
2576 tab[idx[1]]
2577 };
2578 return v_float64x2(vld1q_f64(elems));
2579}
2580
2581inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
2582{
2583 return v_float64x2(vld1q_f64(tab + idx[0]));
2584}
2585
2586inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
2587{
2588 double CV_DECL_ALIGNED(32) elems[2] =
2589 {
2590 tab[vgetq_lane_s32(idxvec.val, 0)],
2591 tab[vgetq_lane_s32(idxvec.val, 1)],
2592 };
2593 return v_float64x2(vld1q_f64(elems));
2594}
2595
2596inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
2597{
2598 int CV_DECL_ALIGNED(32) idx[4];
2599 v_store_aligned(idx, idxvec);
2600
2601 x = v_float64x2(tab[idx[0]], tab[idx[1]]);
2602 y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
2603}
2604#endif
2605
2607#if CV_FP16
2608inline v_float32x4 v_load_expand(const hfloat* ptr)
2609{
2610 float16x4_t v =
2611 #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
2612 (float16x4_t)vld1_s16((const short*)ptr);
2613 #else
2614 vld1_f16((const __fp16*)ptr);
2615 #endif
2616 return v_float32x4(vcvt_f32_f16(v));
2617}
2618
2619inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2620{
2621 float16x4_t hv = vcvt_f16_f32(v.val);
2622
2623 #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
2624 vst1_s16((short*)ptr, (int16x4_t)hv);
2625 #else
2626 vst1_f16((__fp16*)ptr, hv);
2627 #endif
2628}
2629#else
2630inline v_float32x4 v_load_expand(const hfloat* ptr)
2631{
2632 const int N = 4;
2633 float buf[N];
2634 for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2635 return v_load(buf);
2636}
2637
2638inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2639{
2640 const int N = 4;
2641 float buf[N];
2642 v_store(buf, v);
2643 for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
2644}
2645#endif
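// v_load_expand on hfloat widens four half-precision values to a v_float32x4 and
// v_pack_store narrows back; without CV_FP16 support the scalar loops above perform
// the conversion element by element.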
2646
2647inline void v_cleanup() {}
2648
2649CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2650
2652
2653}
2654
2655#endif