#ifndef OPENCV_HAL_INTRIN_MSA_HPP
#define OPENCV_HAL_INTRIN_MSA_HPP

#include "opencv2/core/utility.hpp"

namespace cv
{

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128 1
#define CV_SIMD128_64F 1
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(v16u8 v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = msa_ld1q_u8(v);
    }

    uchar get0() const
    {
        return msa_getq_lane_u8(val, 0);
    }

    v16u8 val;
};
struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(v16i8 v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = msa_ld1q_s8(v);
    }

    schar get0() const
    {
        return msa_getq_lane_s8(val, 0);
    }

    v16i8 val;
};
struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(v8u16 v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = msa_ld1q_u16(v);
    }

    ushort get0() const
    {
        return msa_getq_lane_u16(val, 0);
    }

    v8u16 val;
};
struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(v8i16 v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = msa_ld1q_s16(v);
    }

    short get0() const
    {
        return msa_getq_lane_s16(val, 0);
    }

    v8i16 val;
};
struct v_uint32x4
{
    typedef unsigned int lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(v4u32 v) : val(v) {}
    v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3)
    {
        unsigned int v[] = {v0, v1, v2, v3};
        val = msa_ld1q_u32(v);
    }

    unsigned int get0() const
    {
        return msa_getq_lane_u32(val, 0);
    }

    v4u32 val;
};
struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(v4i32 v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = msa_ld1q_s32(v);
    }

    int get0() const
    {
        return msa_getq_lane_s32(val, 0);
    }

    v4i32 val;
};
struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(v4f32 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = msa_ld1q_f32(v);
    }

    float get0() const
    {
        return msa_getq_lane_f32(val, 0);
    }

    v4f32 val;
};
struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(v2u64 v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = msa_ld1q_u64(v);
    }

    uint64 get0() const
    {
        return msa_getq_lane_u64(val, 0);
    }

    v2u64 val;
};
struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(v2i64 v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = msa_ld1q_s64(v);
    }

    int64 get0() const
    {
        return msa_getq_lane_s64(val, 0);
    }

    v2i64 val;
};
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(v2f64 v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = msa_ld1q_f64(v);
    }

    double get0() const
    {
        return msa_getq_lane_f64(val, 0);
    }

    v2f64 val;
};
#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); }
OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32)
OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32)
OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64)
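
// Usage sketch (illustrative comment only, not part of the HAL API; assumes an
// MSA-enabled build where these types are active):
//
//     v_uint8x16 zeros = v_setzero_u8();             // all 16 lanes = 0
//     v_float32x4 halves = v_setall_f32(0.5f);       // broadcast 0.5f to 4 lanes
//     v_int32x4 bits = v_reinterpret_as_s32(halves); // same 128 bits, viewed as int32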
#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    return _Tpvec(mov(a.val, b.val)); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    return _Tpvec(rshr(a.val, b.val, n)); \
}
#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = mov(a.val); \
    msa_st1_##suffix(ptr, a1); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = rshr(a.val, n); \
    msa_st1_##suffix(ptr, a1); \
}
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64)
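
// Usage sketch (illustrative only): narrowing a widened accumulator back to
// memory with a rounding right shift:
//
//     v_int32x4 acc = v_setall_s32(1000);
//     short dst[4];
//     v_rshr_pack_store<4>(dst, acc);  // dst[i] = saturate_cast<short>((1000 + 8) >> 4) = 63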
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint8x16(msa_pack_u16(a.val, b.val));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val)));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val));
    v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val));
    return v_uint8x16(msa_pack_u16(abcd, efgh));
}
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    v4f32 v0 = v.val;
    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
    res = msa_mlaq_lane_f32(res, m3.val, v0, 3);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    v4f32 v0 = v.val;
    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
    res = msa_addq_f32(res, a.val);
    return v_float32x4(res);
}
#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
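
// Usage sketch (illustrative only): 8/16-bit +/- saturate via the q-prefixed
// intrinsics above, while 32/64-bit and float ops behave natively:
//
//     v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//     v_uint8x16 s = a + b;   // saturates: every lane is 255, not 44
//     v_float32x4 q = v_setall_f32(1.0f) / v_setall_f32(3.0f);  // per-lane divide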
#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    v16i8 a_lo, a_hi, b_lo, b_hi;

    ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi);
    ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi);
    c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo));
    d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi));
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    v16u8 a_lo, a_hi, b_lo, b_hi;

    ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi);
    ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi);
    c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo));
    d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi));
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    v8i16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
    c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo));
    d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    v8u16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
    c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo));
    d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    v4u32 a_lo, a_hi, b_lo, b_hi;

    ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi);
    ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi);
    c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo));
    d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi));
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    v8i16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);

    return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)),
                                   msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16));
}

inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    v8u16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);

    return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)),
                                    msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16));
}
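
// Usage sketch (illustrative only): widening multiply keeps the full product,
// v_mul_hi keeps only the high half of each 16-bit product:
//
//     v_int16x8 a = v_setall_s16(300), b = v_setall_s16(400);
//     v_int32x4 lo, hi;
//     v_mul_expand(a, b, lo, hi);    // each int32 lane holds 120000
//     v_int16x8 h = v_mul_hi(a, b);  // each lane holds 120000 >> 16 = 1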
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(msa_dpadd_s_w(c.val, a.val, b.val)); }

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); }
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_int64x2(msa_dpadd_s_d(c.val, a.val, b.val)); }
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
    v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
    v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
    v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
    v4u32 prod = msa_dotp_u_w(even_a, even_b);
    return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
}

inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
    v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
    v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
    v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
    v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b);
    return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    v8i16 prod = msa_dotp_s_h(a.val, b.val);
    return v_int32x4(msa_hadd_s32(prod, prod));
}

inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
    v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
    v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
    v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
    v2u64 prod = msa_dotp_u_d(even_a, even_b);
    return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
}

inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
    v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
    v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
    v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
    v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b);
    return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    v4i32 prod = msa_dotp_s_w(a.val, b.val);
    return v_int64x2(msa_hadd_s64(prod, prod));
}
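
// Usage sketch (illustrative only): pairwise dot products widen by one step,
// v_dotprod_expand widens by two (here 8-bit inputs, 32-bit accumulators):
//
//     v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
//     v_int32x4 d = v_dotprod(a, b);         // each lane: 3*4 + 3*4 = 24
//     v_uint8x16 u = v_setall_u8(2);
//     v_uint32x4 e = v_dotprod_expand(u, u); // each lane: 4 * (2*2) = 16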
#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
}
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) \
{ \
    return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \
}
#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a) \
{ \
    return _Tpvec(intrin(a.val)); \
}

OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}
#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64)
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); }
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); }
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16)
#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \
}
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
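
// Usage sketch (illustrative only): v_fma/v_muladd compute a*b + c per lane;
// v_magnitude is sqrt(a*a + b*b), as used e.g. for gradient magnitudes:
//
//     v_float32x4 a = v_setall_f32(3.0f), b = v_setall_f32(4.0f);
//     v_float32x4 m = v_magnitude(a, b);               // 5.0f in every lane
//     v_float32x4 r = v_fma(a, b, v_setall_f32(1.0f)); // 13.0f in every lane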
#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); }
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32)
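
// Usage sketch (illustrative only): run-time shifts via operator<</>>,
// compile-time shifts via v_shl/v_shr/v_rshr (v_rshr rounds before shifting):
//
//     v_uint16x8 a = v_setall_u16(17);
//     v_uint16x8 b = a >> 1;        // 8 in every lane
//     v_uint16x8 c = v_rshr<1>(a);  // (17 + 1) >> 1 = 9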
#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64)
#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    int n = _Tpvec::nlanes; \
    for( int i = 0; i < (n/2); i++ ) \
        ptr[i] = a.val[i]; \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    int n = _Tpvec::nlanes; \
    for( int i = 0; i < (n/2); i++ ) \
        ptr[i] = a.val[i+(n/2)]; \
}
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64)
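
// Usage sketch (illustrative only; the MSA ld1q/st1q intrinsics used above are
// unaligned-safe, which is why the aligned variants share the same body):
//
//     float buf[4] = {1.f, 2.f, 3.f, 4.f};
//     v_float32x4 v = v_load(buf);
//     v_float32x4 w = v_load_halves(buf, buf + 2);  // same contents as v
//     float out[4];
//     v_store(out, v);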
inline v_uint8x16 v_reverse(const v_uint8x16& a)
{
    v_uint8x16 c = v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val));
    return c;
}

inline v_int8x16 v_reverse(const v_int8x16& a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8& a)
{
    v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val));
    return c;
}

inline v_int16x8 v_reverse(const v_int16x8& a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_int32x4 v_reverse(const v_int32x4& a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x4 v_reverse(const v_float32x4& a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_int64x2 v_reverse(const v_int64x2& a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x2 v_reverse(const v_float64x2& a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \
inline unsigned short v_reduce_##func(const v_uint16x8& a) \
{ \
    v8u16 a_lo, a_hi; \
    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \
    v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \
    v4u32 b_lo, b_hi; \
    ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \
    v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \
    return (unsigned short)cfunc(c[0], c[1]); \
}

#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \
inline short v_reduce_##func(const v_int16x8& a) \
{ \
    v8i16 a_lo, a_hi; \
    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \
    v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \
    v4i32 b_lo, b_hi; \
    ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \
    v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \
    return (short)cfunc(c[0], c[1]); \
}
#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \
}

#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpvec2 a1, a2; \
    v_expand(a, a1, a2); \
    return (scalartype)v_reduce_##func(v_##func(a1, a2)); \
}

#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    return (scalartype)msa_sum_##suffix(a.val); \
}
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned short, u8)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, short, s8)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned, u16)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, int, s16)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, uint64_t, u32)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int64_t, s32)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); }
inline int64 v_reduce_sum(const v_int64x2& a)
{ return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); }
inline double v_reduce_sum(const v_float64x2& a)
{
    return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1);
}
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))),
                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))));
    v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))),
                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))));

    return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))),
                                    MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0)))));
}
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    v16u8 t0 = msa_abdq_u8(a.val, b.val);
    v8u16 t1 = msa_paddlq_u8(t0);
    v4u32 t2 = msa_paddlq_u16(t1);
    return msa_sum_u32(t2);
}

inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val));
    v8u16 t1 = msa_paddlq_u8(t0);
    v4u32 t2 = msa_paddlq_u16(t1);
    return msa_sum_u32(t2);
}

inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    v8u16 t0 = msa_abdq_u16(a.val, b.val);
    v4u32 t1 = msa_paddlq_u16(t0);
    return msa_sum_u32(t1);
}

inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val));
    v4u32 t1 = msa_paddlq_u16(t0);
    return msa_sum_u32(t1);
}

inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    v4u32 t0 = msa_abdq_u32(a.val, b.val);
    return msa_sum_u32(t0);
}

inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val));
    return msa_sum_u32(t0);
}

inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    v4f32 t0 = msa_abdq_f32(a.val, b.val);
    return msa_sum_f32(t0);
}
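
// Usage sketch (illustrative only): horizontal reductions collapse a vector
// to one scalar; the sum of absolute differences pairs naturally with them:
//
//     v_uint8x16 a = v_setall_u8(10), b = v_setall_u8(7);
//     unsigned sad = v_reduce_sad(a, b);  // 16 * |10 - 7| = 48
//     v_int32x4 s = v_setall_s32(5);
//     int total = (int)v_reduce_sum(s);   // 20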
#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \
inline v_uint8x16 v_popcount(const _Tpvec& a) \
{ \
    v16u8 t = MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \
    return v_uint8x16(t); \
}
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16)

#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \
inline v_uint16x8 v_popcount(const _Tpvec& a) \
{ \
    v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \
    return v_uint16x8(t); \
}
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8)

#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
    v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \
    return v_uint32x4(t); \
}
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4)

#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \
inline v_uint64x2 v_popcount(const _Tpvec& a) \
{ \
    v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \
    return v_uint64x2(t); \
}
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2)
inline int v_signmask(const v_uint8x16& a)
{
    v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100));
    v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0));
    v8u16 v1 = msa_paddlq_u8(v0);
    v4u32 v2 = msa_paddlq_u16(v1);
    v2u64 v3 = msa_paddlq_u32(v2);
    return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
    v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000));
    v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0));
    v4u32 v1 = msa_paddlq_u16(v0);
    v2u64 v2 = msa_paddlq_u32(v1);
    return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4);
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000));
    v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0));
    v2u64 v1 = msa_paddlq_u32(v0);
    return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }

inline int v_signmask(const v_uint64x2& a)
{
    v2u64 v0 = msa_shrq_n_u64(a.val, 63);
    return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1);
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \
    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \
    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \
}

OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7)
OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15)
OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31)
OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63)
#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \
                  MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \
}

OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8)
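
// Usage sketch (illustrative only): branchless thresholding with a comparison
// mask and v_select (mask lanes must be all-ones or all-zeros):
//
//     v_float32x4 x = v_setall_f32(2.0f);
//     v_float32x4 mask = x > v_setall_f32(1.0f);             // all-ones lanes
//     v_float32x4 y = v_select(mask, x, v_setall_f32(0.0f)); // keeps 2.0f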
#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
    _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
    b0.val = msa_paddlq_##suffix(a_lo); \
    b1.val = msa_paddlq_##suffix(a_hi); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
    return _Tpwvec(msa_paddlq_##suffix(a_lo)); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
    return _Tpwvec(msa_paddlq_##suffix(a_hi)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \
}
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]});
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]});
}
#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
    d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
}

OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64)
#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \
template <int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \
}

OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64)
inline v_int32x4 v_round(const v_float32x4& a)
{
    return v_int32x4(msa_cvttintq_s32_f32(a.val));
}

inline v_int32x4 v_floor(const v_float32x4& a)
{
    v4i32 a1 = msa_cvttintq_s32_f32(a.val);
    return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val))));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    v4i32 a1 = msa_cvttintq_s32_f32(a.val);
    return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1)))));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{
    return v_int32x4(msa_cvttruncq_s32_f32(a.val));
}

inline v_int32x4 v_round(const v_float64x2& a)
{
    return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0)));
}

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val)));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    v2f64 a1 = msa_cvtrintq_f64(a.val);
    return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0)));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    v2f64 a1 = msa_cvtrintq_f64(a.val);
    return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0)));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0)));
}
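
// Usage sketch (illustrative only): the four float->int conversions differ
// only in rounding direction:
//
//     v_float32x4 a = v_setall_f32(-1.5f);
//     v_int32x4 r = v_round(a);  // -2 (round to nearest)
//     v_int32x4 f = v_floor(a);  // -2
//     v_int32x4 c = v_ceil(a);   // -1
//     v_int32x4 t = v_trunc(a);  // -1 (toward zero)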
#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
    _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
    _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
    _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
    b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
    b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
}

OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32)
#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    msa_ld2q_##suffix(ptr, &a.val, &b.val); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    msa_st2q_##suffix(ptr, a.val, b.val); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    msa_st3q_##suffix(ptr, a.val, b.val, c.val); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \
}

OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64)
OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64)
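
// Usage sketch (illustrative only): splitting interleaved BGR pixel data into
// planar vectors and packing it back:
//
//     uchar bgr[16 * 3];                       // 16 interleaved BGR pixels, filled elsewhere
//     v_uint8x16 bch, gch, rch;
//     v_load_deinterleave(bgr, bch, gch, rch); // b0..b15 / g0..g15 / r0..r15
//     v_store_interleave(bgr, bch, gch, rch);  // write them back interleaved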
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0f)));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1])));
}

inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1])));
}

inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]])));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
{ return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    uint64 CV_DECL_ALIGNED(32) elems[2] =
    {
        *(uint64*)(tab + idx[0]),
        *(uint64*)(tab + idx[1])
    };
    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems)));
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[msa_getq_lane_s32(idxvec.val, 0)],
        tab[msa_getq_lane_s32(idxvec.val, 1)],
        tab[msa_getq_lane_s32(idxvec.val, 2)],
        tab[msa_getq_lane_s32(idxvec.val, 3)]
    };
    return v_float32x4(msa_ld1q_f32(elems));
}
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2]));
    v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3]));
    x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
    y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val));
    return c;
}

inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val));
    return c;
}

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val));
    return c;
}

inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val));
    return c;
}

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    v_int32x4 c;
    c.val[0] = vec.val[0];
    c.val[1] = vec.val[2];
    c.val[2] = vec.val[1];
    c.val[3] = vec.val[3];
    return c;
}

inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val));
    return c;
}

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val));
    return c;
}
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    v2f64 xy0 = msa_ld1q_f64(tab + idx[0]);
    v2f64 xy1 = msa_ld1q_f64(tab + idx[1]);
    x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
    y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
}
template<int i, typename _Tp>
inline typename _Tp::lane_type v_extract_n(const _Tp& a)
{
    return v_rotate_right<i>(a).get0();
}
template<int i> inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{
    return v_setall_u32(v_extract_n<i>(a));
}

template<int i> inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{
    return v_setall_s32(v_extract_n<i>(a));
}

template<int i> inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{
    return v_setall_f32(v_extract_n<i>(a));
}
#if CV_FP16
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
#ifndef msa_ld1_f16
    v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
#else
    v4f16 v = msa_ld1_f16((const __fp16*)ptr);
#endif
    return v_float32x4(msa_cvt_f32_f16(v));
}

inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
    v4f16 hv = msa_cvt_f16_f32(v.val);

#ifndef msa_st1_f16
    msa_st1_s16((short*)ptr, (int16x4_t)hv);
#else
    msa_st1_f16((__fp16*)ptr, hv);
#endif
}
#else
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
    float buf[4];
    for( int i = 0; i < 4; i++ )
        buf[i] = (float)ptr[i];
    return v_load(buf);
}

inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
    float buf[4];
    v_store(buf, v);
    for( int i = 0; i < 4; i++ )
        ptr[i] = (hfloat)buf[i];
}
#endif
inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

} // namespace cv

#endif