#ifndef OPENCV_HAL_INTRIN_NEON_HPP
#define OPENCV_HAL_INTRIN_NEON_HPP

#include "opencv2/core/utility.hpp"

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#if defined(__aarch64__) || defined(_M_ARM64)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
#endif

#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
#define CV_NEON_AARCH64 1
#else
#define CV_NEON_AARCH64 0
#endif
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
#else
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
    { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#endif
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
    template <typename T> static inline \
    _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
    template <typename T> static inline \
    float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
#else
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
#endif
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)

#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)

OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1, f64)
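// Illustrative example (not part of the original header): _v128_unzip de-interleaves
// two registers, so for uint32x4_t a = {0,1,2,3}, b = {4,5,6,7} it yields
// c = {0,2,4,6} (even lanes) and d = {1,3,5,7} (odd lanes). On AArch64 this is a
// single vuzp1q/vuzp2q pair; on 32-bit ARM it goes through vuzpq.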
template<typename T>
struct VTraits {
    static inline int vlanes() { return T::nlanes; }
    enum { max_nlanes = T::nlanes, nlanes = T::nlanes };
    using lane_type = typename T::lane_type;
};

inline typename VTraits<T>::lane_type v_get0(const T& v) \
    uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
    enum { nlanes = 16 };
    typedef uchar lane_type;
    friend typename VTraits<v_uint8x16>::lane_type v_get0<v_uint8x16>(const v_uint8x16& v);
    return vgetq_lane_u8(val, 0);

    explicit v_int8x16(int8x16_t v) : val(v) {}
    schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
    enum { nlanes = 16 };
    typedef schar lane_type;
    friend typename VTraits<v_int8x16>::lane_type v_get0<v_int8x16>(const v_int8x16& v);
    return vgetq_lane_s8(val, 0);

    ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
    friend typename VTraits<v_uint16x8>::lane_type v_get0<v_uint16x8>(const v_uint16x8& v);
    return vgetq_lane_u16(val, 0);
    explicit v_int16x8(int16x8_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
    typedef short lane_type;
    friend typename VTraits<v_int16x8>::lane_type v_get0<v_int16x8>(const v_int16x8& v);
    return vgetq_lane_s16(val, 0);
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    unsigned v[] = {v0, v1, v2, v3};
    typedef unsigned lane_type;
    friend typename VTraits<v_uint32x4>::lane_type v_get0<v_uint32x4>(const v_uint32x4& v);
    unsigned get0() const
    return vgetq_lane_u32(val, 0);
    explicit v_int32x4(int32x4_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    int v[] = {v0, v1, v2, v3};
    typedef int lane_type;
    friend typename VTraits<v_int32x4>::lane_type v_get0<v_int32x4>(const v_int32x4& v);
    return vgetq_lane_s32(val, 0);
    v_float32x4(float v0, float v1, float v2, float v3)
    float v[] = {v0, v1, v2, v3};
    typedef float lane_type;
    friend typename VTraits<v_float32x4>::lane_type v_get0<v_float32x4>(const v_float32x4& v);
    return vgetq_lane_f32(val, 0);
    friend typename VTraits<v_uint64x2>::lane_type v_get0<v_uint64x2>(const v_uint64x2& v);
    return vgetq_lane_u64(val, 0);

    explicit v_int64x2(int64x2_t v) : val(v) {}
    int64 v[] = {v0, v1};
    typedef int64 lane_type;
    friend typename VTraits<v_int64x2>::lane_type v_get0<v_int64x2>(const v_int64x2& v);
    return vgetq_lane_s64(val, 0);

    double v[] = {v0, v1};
    typedef double lane_type;
    friend typename VTraits<v_float64x2>::lane_type v_get0<v_float64x2>(const v_float64x2& v);
    return vgetq_lane_f64(val, 0);
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)

#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
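// Illustrative usage (not part of the original header):
//   v_float32x4 z    = v_setzero_f32();                           // {0, 0, 0, 0}
//   v_uint8x16  ones = v_setall_u8(0xFF);                         // all lanes 0xFF
//   v_int32x4   bits = v_reinterpret_as_s32(v_setall_f32(1.0f));  // each lane 0x3F800000
// v_reinterpret_as_* changes only the lane interpretation; no values are converted.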
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
    hreg a1 = mov(a.val), b1 = mov(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
    hreg a1 = mov(a.val); \
    vst1_##suffix(ptr, a1); \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
    hreg a1 = rshr(a.val, n); \
    hreg b1 = rshr(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
    hreg a1 = rshr(a.val, n); \
    vst1_##suffix(ptr, a1); \

OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)
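// Illustrative usage (not part of the original header):
//   v_int32x4 a = v_setall_s32( 70000), b = v_setall_s32(-70000);
//   v_int16x8 p = v_pack(a, b);          // vqmovn_s32 saturates: 32767 / -32768 lanes
//   v_int16x8 q = v_rshr_pack<8>(a, b);  // rounding shift right by 8, then narrow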
    uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val));

    uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));
    uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val));
    return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd)));

    uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val));
    uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
    uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
    uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));
    uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd));
    uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh));
    return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh)));
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vmlaq_lane_f32(res, m3.val, vh, 1);

    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vaddq_f32(res, a.val);
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val)); \

OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, vmulq_f64)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64)
    float32x4_t reciprocal = vrecpeq_f32(b.val);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
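// Illustrative usage (not part of the original header): additions and subtractions on
// 8- and 16-bit lanes saturate (vqaddq/vqsubq), e.g.
//   v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//   v_add(a, b);       // every lane is 255 (saturated)
//   v_add_wrap(a, b);  // every lane is 44 (modulo-256 wrap, defined further below)
// 32-bit, 64-bit and floating-point lanes use plain vaddq/vsubq and do not saturate.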
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b) \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
    c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    d.val = vmull_high_s8(a.val, b.val);
    d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));

    c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    d.val = vmull_high_u8(a.val, b.val);
    d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));

    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    d.val = vmull_high_s16(a.val, b.val);
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));

    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    d.val = vmull_high_u16(a.val, b.val);
    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));

    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
    d.val = vmull_high_u32(a.val, b.val);
    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));

    int32x4_t c = vmull_high_s16(a.val, b.val);
    int32x4_t c = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
    vshrn_n_s32(vmull_s16( vget_low_s16(a.val), vget_low_s16(b.val)), 16),

    uint32x4_t c = vmull_high_u16(a.val, b.val);
    uint32x4_t c = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
    vshrn_n_u32(vmull_u16( vget_low_u16(a.val), vget_low_u16(b.val)), 16),
    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);
    int16x4_t b0 = vget_high_s16(uzp1);
    int16x4_t a1 = vget_low_s16(uzp2);
    int16x4_t b1 = vget_high_s16(uzp2);
    int32x4_t p = vmull_s16(a0, b0);

    int16x8_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int16x4_t a0 = vget_low_s16(uzp1);
    int16x4_t b0 = vget_high_s16(uzp1);
    int16x4_t a1 = vget_low_s16(uzp2);
    int16x4_t b1 = vget_high_s16(uzp2);
    int32x4_t p = vmlal_s16(c.val, a0, b0);

    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);
    int32x2_t b0 = vget_high_s32(uzp1);
    int32x2_t a1 = vget_low_s32(uzp2);
    int32x2_t b1 = vget_high_s32(uzp2);
    int64x2_t p = vmull_s32(a0, b0);

    int32x4_t uzp1, uzp2;
    _v128_unzip(a.val, b.val, uzp1, uzp2);
    int32x2_t a0 = vget_low_s32(uzp1);
    int32x2_t b0 = vget_high_s32(uzp1);
    int32x2_t a1 = vget_low_s32(uzp2);
    int32x2_t b1 = vget_high_s32(uzp2);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \
inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b) \
    return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\
inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
    return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \
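// Illustrative note (not part of the original header): vdotq_u32/vdotq_s32 require the
// Armv8.2 dot-product extension, so on such builds
//   v_uint32x4 s = v_dotprod_expand(a, b);   // a, b : v_uint8x16
// compiles to a single UDOT/SDOT, each 32-bit lane holding the sum of four adjacent
// 8-bit products. The code below is the generic fallback with the same semantics.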
    const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
    const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
    const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
                                vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
    uint16x8_t odd  = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
                                vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));

    uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
                              vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
    uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
                              vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
    int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
    int16x8_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int16x8_t sum = vaddq_s16(uzp1, uzp2);
    int16x4_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
    return v_int32x4(vaddl_s16(uzpl1, uzpl2));
    const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
    const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));

    uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
                                vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
    uint32x4_t odd  = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
                                vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
    uint32x4_t uzp1, uzp2;
    _v128_unzip(even, odd, uzp1, uzp2);
    uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
    uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
    int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));

    int32x4_t uzp1, uzp2;
    _v128_unzip(p0, p1, uzp1, uzp2);
    int32x4_t sum = vaddq_s32(uzp1, uzp2);

    int32x2_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
    return v_int64x2(vaddl_s32(uzpl1, uzpl2));
    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    return v_int32x4(vmlal_high_s16(p, a.val, b.val));

    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmull_s16(a0, b0);

    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
    return v_int32x4(vmlal_high_s16(p, a.val, b.val));

    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmlal_s16(c.val, a0, b0);

    int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
    return v_int64x2(vmlal_high_s32(p, a.val, b.val));

    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmull_s32(a0, b0);

    int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
    return v_int64x2(vmlal_high_s32(p, a.val, b.val));

    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \
inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b) \
    return v_dotprod_expand(a, b); \
inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
    return v_dotprod_expand(a, b, c); \
    uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
    uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
    uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));

    int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
    return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));

    uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
    uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
    uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
    uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));

    int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
    return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \
inline _Tpvec v_not (const _Tpvec& a) \
    return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \

OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \

OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32)
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));

    return v_div(one, v_sqrt(x));

    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);

    float32x4_t e = vrsqrteq_f32(x.val);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \
    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \

OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64)
OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64)

    return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));

    return v_div(one, v_sqrt(x));
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val)); \

OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
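// Illustrative note (not part of the original header): each comparison returns a mask
// vector of the same type, with all bits of a lane set when the predicate holds and
// cleared otherwise, e.g.
//   v_uint8x16 m = v_eq(a, b);   // 0xFF where a == b, 0x00 elsewhere
// Such masks feed directly into v_select(), defined later in this file.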
#if defined(__aarch64__) || defined(_M_ARM64)
static inline uint64x2_t vmvnq_u64(uint64x2_t a)
    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
    return veorq_u64(a, vx);

{ return v_uint64x2(vceqq_u64(a.val, b.val)); }
{ return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); }
{ return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); }
{ return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); }

    uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped)));

    uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
    uint32x4_t swapped = vrev64q_u32(cmp);
    uint64x2_t v_eq = vreinterpretq_u64_u32(vandq_u32(cmp, swapped));
    uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));

    return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
    return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));

OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }

{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); }
{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); }

#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec2(cast(intrin(a.val, b.val))); \
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
    return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));

    return v_float32x4(vfmaq_f32(c.val, a.val, b.val));
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
    return v_fma(a, b, c);
    return v_fma(a, b, c);

    v_float64x2 x(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));
    return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val)));

    return v_float64x2(vfmaq_f64(c.val, a.val, b.val));
    return v_fma(a, b, c);
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec v_shl (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec v_shr (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
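// Illustrative usage (not part of the original header):
//   v_shl<3>(v_setall_s32(1));   // every lane becomes 8
//   v_shr<1>(v_setall_u16(7));   // every lane becomes 3 (logical shift for unsigned)
//   v_shr<1>(v_setall_s32(-8));  // every lane becomes -4 (arithmetic shift for signed)
// v_rshr<n> is a rounding shift right (vrshrq_n), often used before narrowing packs.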
#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
#if defined(__clang__) && defined(__aarch64__)
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
    typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
    uint64 v = *(unaligned_uint64*)ptr; \
    return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
#else
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
#endif
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
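// Illustrative usage (not part of the original header):
//   float buf[4] = {1.f, 2.f, 3.f, 4.f};
//   v_float32x4 v = v_load(buf);          // vld1q_f32
//   v = v_mul(v, v_setall_f32(2.f));
//   v_store(buf, v);                      // buf = {2, 4, 6, 8}
// On NEON the aligned and unaligned variants map to the same vld1q/vst1q instructions.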
    uint16_t t0 = vaddlvq_u8(a.val);

    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);

    int16_t t0 = vaddlvq_s8(a.val);

    int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);

    uint32_t t0 = vaddlvq_u16(a.val);

    uint32x4_t t0 = vpaddlq_u16(a.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);

    int32_t t0 = vaddlvq_s16(a.val);

    int32x4_t t0 = vpaddlq_s16(a.val);
    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    return vget_lane_s32(vpadd_s32(t1, t1), 0);
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
    return v##vectorfunc##vq_##suffix(a.val); \
#else
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
#endif

#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
    return v##vectorfunc##vq_##suffix(a.val); \
#else
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    a0 = vp##vectorfunc##_##suffix(a0, a0); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
#endif
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
    return v##vectorfunc##vq_##suffix(a.val); \
#else
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
#endif
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
    return vaddvq_u64(a.val);
    return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);

    return vaddvq_s64(a.val);
    return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);

    return vaddvq_f64(a.val);
    float32x4_t ab = vpaddq_f32(a.val, b.val);
    float32x4_t cd = vpaddq_f32(c.val, d.val);

    float32x4x2_t ab = vtrnq_f32(a.val, b.val);
    float32x4x2_t cd = vtrnq_f32(c.val, d.val);
    float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]);
    float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]);
    float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
    float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
    uint8x16_t t0 = vabdq_u8(a.val, b.val);
    uint16_t t1 = vaddlvq_u8(t0);

    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);

    uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
    uint16_t t1 = vaddlvq_u8(t0);

    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);

    uint16x8_t t0 = vabdq_u16(a.val, b.val);
    uint32_t t1 = vaddlvq_u16(t0);

    uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);

    uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
    uint32_t t1 = vaddlvq_u16(t0);

    uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);

    uint32x4_t t0 = vabdq_u32(a.val, b.val);
    uint32_t t1 = vaddvq_u32(t0);

    uint32x4_t t0 = vabdq_u32(a.val, b.val);
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);

    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
    uint32_t t1 = vaddvq_u32(t0);

    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
    return vget_lane_u32(vpadd_u32(t1, t1), 0);

    float32x4_t t0 = vabdq_f32(a.val, b.val);
    return vaddvq_f32(t0);

    float32x4_t t0 = vabdq_f32(a.val, b.val);
    float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
    return vget_lane_f32(vpadd_f32(t1, t1), 0);
{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
    const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
    const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
    uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
    uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));

    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);

{ return v_signmask(v_reinterpret_as_u8(a)); }

    const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
    uint32_t t0 = vaddlvq_u16(v0);

    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);

{ return v_signmask(v_reinterpret_as_u16(a)); }

    const int32x4_t signPosition = {0,1,2,3};
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
    uint32_t t0 = vaddvq_u32(v0);

    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);

{ return v_signmask(v_reinterpret_as_u32(a)); }
{ return v_signmask(v_reinterpret_as_u32(a)); }

    const int64x2_t signPosition = {0,1};
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
    int t0 = (int)vaddvq_u64(v0);

    int64x1_t m0 = vdup_n_s64(0);
    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);

{ return v_signmask(v_reinterpret_as_u64(a)); }
{ return v_signmask(v_reinterpret_as_u64(a)); }
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
    return (vminvq_##suffix(a.val) >> shift) != 0; \
inline bool v_check_any(const v_##_Tpvec& a) \
    return (vmaxvq_##suffix(a.val) >> shift) != 0; \
#else
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
inline bool v_check_any(const v_##_Tpvec& a) \
    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
#endif

OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)

    uint64x2_t v0 = vshrq_n_u64(a.val, 63);
    return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;

    uint64x2_t v0 = vshrq_n_u64(a.val, 63);
    return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \

OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
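// Illustrative usage (not part of the original header), with src a hypothetical float pointer:
//   v_float32x4 a   = v_load(src);
//   v_float32x4 thr = v_setall_f32(0.5f);
//   v_float32x4 m   = v_lt(a, thr);                     // all-ones lanes where a < 0.5
//   v_float32x4 r   = v_select(m, v_setzero_f32(), a);  // zero those lanes, keep the rest
// v_select maps to a single vbslq bit-select, so the mask is applied bit by bit.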
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_high_##suffix(a.val); \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
    return _Tpwvec(vmovl_high_##suffix(a.val)); \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
#else
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
    return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
#endif

    uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr);
    uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));

    int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr);
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
#if defined(__aarch64__) || defined(_M_ARM64)
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
    b0.val = vzip1q_##suffix(a0.val, a1.val); \
    b1.val = vzip2q_##suffix(a0.val, a1.val); \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
#else
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
    b0.val = p.val[0]; \
    b1.val = p.val[1]; \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
#endif

OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
    uint8x16_t vec = vrev64q_u8(a.val);

{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

    uint16x8_t vec = vrev64q_u16(a.val);

{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

    uint32x4_t vec = vrev64q_u32(a.val);

{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

    uint64x2_t vec = a.val;
    uint64x1_t vec_lo = vget_low_u64(vec);
    uint64x1_t vec_hi = vget_high_u64(vec);
    return v_uint64x2(vcombine_u64(vec_hi, vec_lo));

{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \

OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }

OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }

OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
    float32x4_t a_ = a.val;
#if defined _MSC_VER
    result = vcvtnq_s32_f32(a_);
    __asm__ ("fcvtns %0.4s, %1.4s"

    float32x4_t delta = vdupq_n_f32(12582912.0f);

    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));

    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
    return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));

{ return v_int32x4(vcvtq_s32_f32(a.val)); }
    static const int32x2_t zero = vdup_n_s32(0);
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero));

    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val))));

    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(vcvtq_f64_s64(a1), a.val);
    a1 = vaddq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));

    static const int32x2_t zero = vdup_n_s32(0);
    int64x2_t a1 = vcvtq_s64_f64(a.val);
    uint64x2_t mask = vcgtq_f64(a.val, vcvtq_f64_s64(a1));
    a1 = vsubq_s64(a1, vreinterpretq_s64_u64(mask));
    return v_int32x4(vcombine_s32(vmovn_s64(a1), zero));

    static const int32x2_t zero = vdup_n_s32(0);
    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
    _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
    _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
    _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
    _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
    b0.val = vtrn1q_##suffix##32(t0, t1); \
    b1.val = vtrn2q_##suffix##32(t0, t1); \
    b2.val = vtrn1q_##suffix##32(t2, t3); \
    b3.val = vtrn2q_##suffix##32(t2, t3); \

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \

OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
    _Tpvec##x2_t v = vld2q_##suffix(ptr); \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
    vst2q_##suffix(ptr, v); \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode =hal::STORE_UNALIGNED) \
    vst3q_##suffix(ptr, v); \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode =hal::STORE_UNALIGNED ) \
    vst4q_##suffix(ptr, v); \
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t a1 = vld1_##suffix(ptr + 2); \
    tp##x1_t b1 = vld1_##suffix(ptr + 3); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
                                 v_##tp##x2& b, v_##tp##x2& c ) \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
    tp##x1_t a1 = vld1_##suffix(ptr + 3); \
    tp##x1_t b1 = vld1_##suffix(ptr + 4); \
    tp##x1_t c1 = vld1_##suffix(ptr + 5); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
                                 v_##tp##x2& c, v_##tp##x2& d ) \
    tp##x1_t a0 = vld1_##suffix(ptr); \
    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
    tp##x1_t d0 = vld1_##suffix(ptr + 3); \
    tp##x1_t a1 = vld1_##suffix(ptr + 4); \
    tp##x1_t b1 = vld1_##suffix(ptr + 5); \
    tp##x1_t c1 = vld1_##suffix(ptr + 6); \
    tp##x1_t d1 = vld1_##suffix(ptr + 7); \
    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
    d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
                                const v_##tp##x2& b, const v_##tp##x2& c, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
    vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
    vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
                                const v_##tp##x2& c, const v_##tp##x2& d, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
    vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
    vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
    vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
    vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
    vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
2239 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
2240 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
2241 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
2242 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
2243 OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
2244 OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
2245 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
2247 OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
2250 OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
2251 OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
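// --- Editor's illustrative sketch, not part of the original header ---
// Rough usage of the interleaved store helpers generated above (and the matching
// v_load_deinterleave overloads generated earlier in this file): swapping the R
// and B planes of a packed 3-channel uchar row. The function name 'swap_rb_row'
// and its arguments are hypothetical.
inline void swap_rb_row(uchar* rgb, int npixels)
{
    int i = 0;
    for( ; i + 16 <= npixels; i += 16 )
    {
        v_uint8x16 r, g, b;
        v_load_deinterleave(rgb + 3*i, r, g, b); // gather every 3rd byte into r/g/b
        v_store_interleave(rgb + 3*i, b, g, r);  // write back with R and B swapped
    }
    for( ; i < npixels; i++ ) // scalar tail for the remaining pixels
    {
        uchar t = rgb[3*i]; rgb[3*i] = rgb[3*i + 2]; rgb[3*i + 2] = t;
    }
}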
2261 float32x2_t zero = vdup_n_f32(0.0f);
2262 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero));
2267 return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
2272 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val))));
2277 return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_high_s32(a.val))));
2282 return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
2287 return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
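// --- Editor's illustrative sketch, not part of the original header ---
// Widening a packed int32 vector to two double vectors and repacking it as float,
// using the conversions defined above (only meaningful when CV_SIMD128_64F is 1).
// The function name 'cvt_s32_to_f32_via_f64' is hypothetical.
inline v_float32x4 cvt_s32_to_f32_via_f64(const v_int32x4& a)
{
    v_float64x2 lo = v_cvt_f64(a);      // lanes 0..1 as double
    v_float64x2 hi = v_cvt_f64_high(a); // lanes 2..3 as double
    return v_cvt_f32(lo, hi);           // repack all four lanes as float
}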
2402 return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
2421 return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
2433 return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
2439 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
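// --- Editor's illustrative sketch, not part of the original header ---
// Gathering two 64-bit table entries by index with the v_lut overloads just above.
// 'gather_two' and its arguments are hypothetical names.
inline v_int64x2 gather_two(const int64* table, int i0, int i1)
{
    int idx[2] = { i0, i1 };
    return v_lut(table, idx); // lanes become { table[i0], table[i1] }
}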
2459 *(unaligned_uint64*)(tab + idx[0]),
2460 *(unaligned_uint64*)(tab + idx[1])
2462 return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
2473 tab[vgetq_lane_s32(idxvec.val, 0)],
2474 tab[vgetq_lane_s32(idxvec.val, 1)],
2475 tab[vgetq_lane_s32(idxvec.val, 2)],
2476 tab[vgetq_lane_s32(idxvec.val, 3)]
2485 tab[vgetq_lane_s32(idxvec.val, 0)],
2486 tab[vgetq_lane_s32(idxvec.val, 1)],
2487 tab[vgetq_lane_s32(idxvec.val, 2)],
2488 tab[vgetq_lane_s32(idxvec.val, 3)]
2497 tab[vgetq_lane_s32(idxvec.val, 0)],
2498 tab[vgetq_lane_s32(idxvec.val, 1)],
2499 tab[vgetq_lane_s32(idxvec.val, 2)],
2500 tab[vgetq_lane_s32(idxvec.val, 3)]
2525 return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
2530 return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
2536 return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
2541 int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
2542 return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
2548 int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
2549 return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
2556 return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
2562 return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
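// Note (editor's comment): the 64-bit vcreate_s8 constants above are per-byte
// vtbl1 index tables, read little-endian. For example 0x0705060403010200 is the
// byte sequence {0,2,1,3,4,6,5,7}, i.e. the "interleave pairs" permutation applied
// independently to each 8-byte half of the vector.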
2590 tab[vgetq_lane_s32(idxvec.val, 0)],
2591 tab[vgetq_lane_s32(idxvec.val, 1)],
2612 (float16x4_t)vld1_s16((const short*)ptr);
2614 vld1_f16((const __fp16*)ptr);
2621 float16x4_t hv = vcvt_f16_f32(v.val);
2624 vst1_s16((short*)ptr, (int16x4_t)hv);
2626 vst1_f16((__fp16*)ptr, hv);
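// Note (editor's comment): the loops below belong to the scalar fallback used when
// the native FP16 conversion path above is not available; values are staged
// through a small float buffer and converted one element at a time.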
2634 for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2643 for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
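// --- Editor's illustrative sketch, not part of the original header ---
// Round-tripping a half-precision buffer through float registers with the
// v_load_expand / v_pack_store overloads above. 'copy_fp16', 'src', 'dst' and 'n'
// are hypothetical names; n is assumed to be a multiple of 4.
inline void copy_fp16(const hfloat* src, hfloat* dst, int n)
{
    for( int i = 0; i < n; i += 4 )
    {
        v_float32x4 v = v_load_expand(src + i); // 4 half floats -> 4 floats
        v_pack_store(dst + i, v);               // 4 floats -> 4 half floats
    }
}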
2649 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END