#ifndef OPENCV_HAL_INTRIN_LSX_HPP
#define OPENCV_HAL_INTRIN_LSX_HPP

#define CV_SIMD128_64F 1
#define CV_SIMD128_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
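// 128-bit universal intrinsics implemented on top of the LoongArch SIMD eXtension (LSX).
// CV_SIMD128_64F = 1: v_float64x2 (double) lanes are available.
// CV_SIMD128_FP16 = 0: no native half-float arithmetic; fp16 data is only converted to/from fp32.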
inline __m128i _v128_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
                            char v7, char v8, char v9, char v10, char v11, char v12, char v13,
                            char v14, char v15)
{
    return (__m128i)v16i8{ v0, v1, v2, v3, v4, v5, v6, v7,
                           v8, v9, v10, v11, v12, v13, v14, v15 };
}

inline __m128i _v128_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
                           char v7, char v8, char v9, char v10, char v11, char v12, char v13,
                           char v14, char v15)
{
    return (__m128i)v16i8{ v15, v14, v13, v12, v11, v10, v9, v8,
                           v7, v6, v5, v4, v3, v2, v1, v0 };
}
inline __m128i _v128_setr_h(short v0, short v1, short v2, short v3, short v4, short v5,
                            short v6, short v7)
{
    return (__m128i)v8i16{ v0, v1, v2, v3, v4, v5, v6, v7 };
}

inline __m128i _v128_setr_w(int v0, int v1, int v2, int v3)
{
    return (__m128i)v4i32{ v0, v1, v2, v3 };
}

inline __m128i _v128_set_w(int v0, int v1, int v2, int v3)
{
    return (__m128i)v4i32{ v3, v2, v1, v0 };
}

inline __m128i _v128_setall_w(int v0)
{
    return __lsx_vreplgr2vr_w(v0);
}
inline __m128i _v128_setr_d(int64 v0, int64 v1)
{
    return (__m128i)v2i64{ v0, v1 };
}

inline __m128i _v128_set_d(int64 v0, int64 v1)
{
    return (__m128i)v2i64{ v1, v0 };
}

inline __m128 _v128_setr_ps(float v0, float v1, float v2, float v3)
{
    return (__m128)v4f32{ v0, v1, v2, v3 };
}

inline __m128 _v128_setall_ps(float v0)
{
    return (__m128)v4f32{ v0, v0, v0, v0 };
}

inline __m128d _v128_setr_pd(double v0, double v1)
{
    return (__m128d)v2f64{ v0, v1 };
}

inline __m128d _v128_setall_pd(double v0)
{
    return (__m128d)v2f64{ v0, v0 };
}
inline __m128i _lsx_packus_h(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_bu_h(b, a, 0);
}

inline __m128i _lsx_packs_h(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_b_h(b, a, 0);
}

inline __m128i _lsx_packus_w(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_hu_w(b, a, 0);
}
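// Saturating pack helpers: the __lsx_vssrarni_*/__lsx_vssrlrni_* builtins shift each element of
// the concatenation (b:a) right by the immediate (0 here) with rounding, then narrow with
// signed/unsigned saturation, so these wrappers behave like classic packs/packus operations.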
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };
    __m128i val;

    explicit v_uint8x16(__m128i v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    { val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); }

    uchar get0() const
    { return (uchar)__lsx_vpickve2gr_bu(val, 0); }
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };
    __m128i val;

    explicit v_int8x16(__m128i v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    { val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); }

    schar get0() const
    { return (schar)__lsx_vpickve2gr_b(val, 0); }
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };
    __m128i val;

    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    { val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7); }

    ushort get0() const
    { return (ushort)__lsx_vpickve2gr_hu(val, 0); }
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };
    __m128i val;

    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    { val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7); }

    short get0() const
    { return (short)__lsx_vpickve2gr_h(val, 0); }
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };
    __m128i val;

    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    { val = _v128_setr_w(v0, v1, v2, v3); }

    unsigned get0() const
    { return (unsigned)__lsx_vpickve2gr_wu(val, 0); }
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };
    __m128i val;

    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    { val = _v128_setr_w(v0, v1, v2, v3); }

    int get0() const
    { return (int)__lsx_vpickve2gr_w(val, 0); }
};
struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };
    __m128 val;

    explicit v_float32x4(__m128 v) : val(v) {}
    explicit v_float32x4(__m128i v) { val = *((__m128*)&v); }
    v_float32x4(float v0, float v1, float v2, float v3)
    { val = _v128_setr_ps(v0, v1, v2, v3); }

    float get0() const
    {
        union { int iv; float fv; } d;
        d.iv = __lsx_vpickve2gr_w(val, 0);
        return d.fv;
    }

    int get0toint() const
    {
        __m128i result = __lsx_vftintrz_w_s(val);
        return (int)__lsx_vpickve2gr_w(result, 0);
    }
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };
    __m128i val;

    explicit v_uint64x2(__m128i v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    { val = _v128_setr_d(v0, v1); }

    uint64 get0() const
    { return __lsx_vpickve2gr_du(val, 0); }
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };
    __m128i val;

    explicit v_int64x2(__m128i v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    { val = _v128_setr_d(v0, v1); }

    int64 get0() const
    { return __lsx_vpickve2gr_d(val, 0); }
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };
    __m128d val;

    explicit v_float64x2(__m128d v) : val(v) {}
    explicit v_float64x2(__m128i v) { val = *((__m128d*)&v); }
    v_float64x2(double v0, double v1)
    { val = _v128_setr_pd(v0, v1); }

    double get0() const
    {
        union { int64 iv; double fv; } d;
        d.iv = __lsx_vpickve2gr_d(val, 0);
        return d.fv;
    }

    int64 get0toint64() const
    {
        __m128i result = __lsx_vftintrz_l_d(val);
        return (int64)__lsx_vpickve2gr_d(result, 0);
    }
};
#define OPENCV_HAL_IMPL_LSX_LOADSTORE(_Tpvec, _Tp)                        \
    inline _Tpvec v_load(const _Tp* ptr)                                  \
    { return _Tpvec(__lsx_vld(ptr, 0)); }                                 \
    inline _Tpvec v_load_aligned(const _Tp* ptr)                          \
    { return _Tpvec(__lsx_vld(ptr, 0)); }                                 \
    inline _Tpvec v_load_low(const _Tp* ptr)                              \
    { return _Tpvec(__lsx_vldrepl_d(ptr, 0)); }                           \
    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)         \
    {                                                                     \
        __m128i vl = __lsx_vldrepl_d(ptr0, 0);                            \
        __m128i vh = __lsx_vldrepl_d(ptr1, 0);                            \
        return _Tpvec(__lsx_vilvl_d(vh, vl));                             \
    }                                                                     \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                        \
    { __lsx_vst(a.val, ptr, 0); }                                         \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
    { __lsx_vst(a.val, ptr, 0); }                                         \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)        \
    { __lsx_vst(a.val, ptr, 0); }                                         \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)   \
    {                                                                     \
        if (mode == hal::STORE_UNALIGNED)                                 \
            __lsx_vst(a.val, ptr, 0);                                     \
        else if (mode == hal::STORE_ALIGNED_NOCACHE)                      \
            __lsx_vst(a.val, ptr, 0);                                     \
        else                                                              \
            __lsx_vst(a.val, ptr, 0);                                     \
    }                                                                     \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                    \
    { __lsx_vstelm_d(a.val, ptr, 0, 0); }                                 \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                   \
    { __lsx_vstelm_d(a.val, ptr, 0, 1); }

OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int16x8, short)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int32x4, int)
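// Usage sketch (hypothetical buffer, assuming the instantiations above):
//   int buf[4] = { 1, 2, 3, 4 };
//   v_int32x4 x = v_load(buf);   // 16-byte load; alignment not required on LSX
//   v_store(buf, x);             // aligned/nocache variants map to the same __lsx_vst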
#define OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg)            \
    inline _Tpvec v_load(const _Tp* ptr)                                   \
    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); }                         \
    inline _Tpvec v_load_aligned(const _Tp* ptr)                           \
    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); }                         \
    inline _Tpvec v_load_low(const _Tp* ptr)                               \
    { return _Tpvec((halfreg)__lsx_vldrepl_d(ptr, 0)); }                   \
    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)          \
    {                                                                      \
        __m128i vl = __lsx_vldrepl_d(ptr0, 0);                             \
        __m128i vh = __lsx_vldrepl_d(ptr1, 0);                             \
        return _Tpvec((halfreg)__lsx_vilvl_d(vh, vl));                     \
    }                                                                      \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                         \
    { __lsx_vst((__m128i)a.val, ptr, 0); }                                 \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                 \
    { __lsx_vst((__m128i)a.val, ptr, 0); }                                 \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)         \
    { __lsx_vst((__m128i)a.val, ptr, 0); }                                 \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)    \
    {                                                                      \
        if (mode == hal::STORE_UNALIGNED)                                  \
            __lsx_vst((__m128i)a.val, ptr, 0);                             \
        else if (mode == hal::STORE_ALIGNED_NOCACHE)                       \
            __lsx_vst((__m128i)a.val, ptr, 0);                             \
        else                                                               \
            __lsx_vst((__m128i)a.val, ptr, 0);                             \
    }                                                                      \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                     \
    { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 0); }                         \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                    \
    { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 1); }

OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float32x4, float, __m128)
OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float64x2, double, __m128d)
inline __m128i _lsx_128_castps_si128(const __m128& v)
{ return __m128i(v); }

inline __m128i _lsx_128_castpd_si128(const __m128d& v)
{ return __m128i(v); }

#define OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, _Tpvecf, suffix, cast)   \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)     \
    { return _Tpvec(cast(a.val)); }
#define OPENCV_HAL_IMPL_LSX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)              \
    inline _Tpvec v_setzero_##suffix()                                               \
    { return _Tpvec(__lsx_vldi(0)); }                                                \
    inline _Tpvec v_setall_##suffix(_Tp v)                                           \
    { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); }                       \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, OPENCV_HAL_NOP)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, OPENCV_HAL_NOP)              \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, OPENCV_HAL_NOP)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, OPENCV_HAL_NOP)              \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, OPENCV_HAL_NOP)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, OPENCV_HAL_NOP)              \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, OPENCV_HAL_NOP)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, OPENCV_HAL_NOP)              \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float32x4, suffix, _lsx_128_castps_si128)     \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float64x2, suffix, _lsx_128_castpd_si128)

OPENCV_HAL_IMPL_LSX_INIT(v_int16x8, short, s16, h, int)
OPENCV_HAL_IMPL_LSX_INIT(v_uint32x4, unsigned, u32, w, int)
OPENCV_HAL_IMPL_LSX_INIT(v_int32x4, int, s32, w, int)
inline __m128 _lsx_128_castsi128_ps(const __m128i& v)
{ return __m128(v); }

inline __m128d _lsx_128_castsi128_pd(const __m128i& v)
{ return __m128d(v); }

#define OPENCV_HAL_IMPL_LSX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast)   \
    inline _Tpvec v_setzero_##suffix()                                     \
    { return _Tpvec(__lsx_vldi(0)); }                                      \
    inline _Tpvec v_setall_##suffix(_Tp v)                                 \
    { return _Tpvec(_v128_setall_##zsuffix(v)); }                          \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, cast)              \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, cast)              \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, cast)              \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, cast)

OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float32x4, float, f32, ps, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float64x2, double, f64, pd, _lsx_128_castsi128_pd)

inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a)
{ return v_float32x4(_lsx_128_castps_si128(__m128(a.val))); }
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a)
{ return v_float64x2(_lsx_128_castpd_si128(__m128d(a.val))); }
#define OPENCV_HAL_IMPL_LSX_UNPACK(_Tpvec, suffix)                               \
    inline _Tpvec v128_unpacklo(const _Tpvec& a, const _Tpvec& b)                \
    { return _Tpvec(__lsx_vilvl_##suffix(__m128i(b.val), __m128i(a.val))); }     \
    inline _Tpvec v128_unpackhi(const _Tpvec& a, const _Tpvec& b)                \
    { return _Tpvec(__lsx_vilvh_##suffix(__m128i(b.val), __m128i(a.val))); }

#define OPENCV_HAL_IMPL_LSX_ZIP(_Tpvec)                                  \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)        \
    { return (_Tpvec)__lsx_vilvl_d((__m128i)b.val, (__m128i)a.val); }    \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)       \
    { return (_Tpvec)__lsx_vilvh_d((__m128i)b.val, (__m128i)a.val); }    \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,            \
                            _Tpvec& c, _Tpvec& d)                        \
    {                                                                    \
        __m128i a1 = (__m128i)a.val, b1 = (__m128i)b.val;                \
        c = _Tpvec(__lsx_vilvl_d(b1, a1));                               \
        d = _Tpvec(__lsx_vilvh_d(b1, a1));                               \
    }                                                                    \
    inline void v_zip(const _Tpvec& a, const _Tpvec& b,                  \
                      _Tpvec& ab0, _Tpvec& ab1)                          \
    {                                                                    \
        ab0 = v128_unpacklo(a, b);                                       \
        ab1 = v128_unpackhi(a, b);                                       \
    }
#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin)              \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)    \
    { return _Tpvec(intrin(a.val, b.val)); }                            \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)      \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d)

OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
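// Note: the 8-/16-bit operator+/- instantiations above use the saturating __lsx_vsadd/__lsx_vssub
// forms, while the 32-/64-bit ones (and v_add_wrap/v_sub_wrap below) wrap modulo 2^n.
// e.g. an all-255 v_uint8x16 plus all-1 stays at 255 with operator+, but wraps to 0 with v_add_wrap.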
inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i pev = __lsx_vmulwev_w_hu(a0, b0);
    __m128i pod = __lsx_vmulwod_w_hu(a0, b0);
    __m128i pl = __lsx_vilvl_w(pod, pev);
    __m128i ph = __lsx_vilvh_w(pod, pev);
    return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
}

inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i pev = __lsx_vmulwev_w_h(a0, b0);
    __m128i pod = __lsx_vmulwod_w_h(a0, b0);
    __m128i pl = __lsx_vilvl_w(pod, pev);
    __m128i ph = __lsx_vilvh_w(pod, pev);
    return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
}

inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
{ a = a * b; return a; }
inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
{ a = a * b; return a; }
inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
{ a = a * b; return a; }
inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
{ a = a * b; return a; }
#define OPENCV_HAL_IMPL_LSX_BIN_FUNC(func, _Tpvec, intrin)     \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)       \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint8x16, __lsx_vadd_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int8x16, __lsx_vadd_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint16x8, __lsx_vadd_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int16x8, __lsx_vadd_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint8x16, __lsx_vsub_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int8x16, __lsx_vsub_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint16x8, __lsx_vsub_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int16x8, __lsx_vsub_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_uint16x8, __lsx_vmul_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_int16x8, __lsx_vmul_h)
inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
    // keep the low byte of each widened product, restoring the original lane order
    return v_uint8x16(__lsx_vpackev_b(p1, p0));
}

inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
    c.val = __lsx_vilvl_h(p1, p0);
    d.val = __lsx_vilvh_h(p1, p0);
}

inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_b(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_b(a0, b0);
    c.val = __lsx_vilvl_h(p1, p0);
    d.val = __lsx_vilvh_h(p1, p0);
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_w_h(a0, b0);
    __m128i p1 = __lsx_vmulwod_w_h(a0, b0);
    c.val = __lsx_vilvl_w(p1, p0);
    d.val = __lsx_vilvh_w(p1, p0);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_w_hu(a0, b0);
    __m128i p1 = __lsx_vmulwod_w_hu(a0, b0);
    c.val = __lsx_vilvl_w(p1, p0);
    d.val = __lsx_vilvh_w(p1, p0);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_d_wu(a0, b0);
    __m128i p1 = __lsx_vmulwod_d_wu(a0, b0);
    c.val = __lsx_vilvl_d(p1, p0);
    d.val = __lsx_vilvh_d(p1, p0);
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(__lsx_vmuh_h(a.val, b.val)); }
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint16x8(__lsx_vmuh_hu(a.val, b.val)); }
#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)                       \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm)                                 \
    { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }        \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm)                                 \
    { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }        \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)                                 \
    { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }        \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)                                 \
    { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); }                       \
    template<int imm>                                                                      \
    inline _Tpuvec v_shl(const _Tpuvec& a)                                                 \
    { return _Tpuvec(__lsx_vslli_##suffix(a.val, imm)); }                                  \
    template<int imm>                                                                      \
    inline _Tpsvec v_shl(const _Tpsvec& a)                                                 \
    { return _Tpsvec(__lsx_vslli_##suffix(a.val, imm)); }                                  \
    template<int imm>                                                                      \
    inline _Tpuvec v_shr(const _Tpuvec& a)                                                 \
    { return _Tpuvec(__lsx_vsrli_##suffix(a.val, imm)); }                                  \
    template<int imm>                                                                      \
    inline _Tpsvec v_shr(const _Tpsvec& a)                                                 \
    { return _Tpsvec(__lsx_vsrai_##suffix(a.val, imm)); }
#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix)                \
    OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix)      \
    OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix)       \
    OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix)      \
    inline _Tpvec operator ~(const _Tpvec& a)                       \
    { return _Tpvec(__lsx_vnori_b(a.val, 0)); }

OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int8x16, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int16x8, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int32x4, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast)     \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)       \
    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }         \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)         \
    {                                                                      \
        __m128i c = intrin((__m128i)(a.val), (__m128i)b.val);              \
        a.val = cast(c);                                                   \
        return a;                                                          \
    }

#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast)                   \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast)        \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast)         \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast)        \
    inline _Tpvec operator ~ (const _Tpvec& a)                             \
    { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); }

OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float64x2, _lsx_128_castsi128_pd)
#define OPENCV_HAL_IMPL_LSX_SELECT(_Tpvec)                                           \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b)     \
    { return _Tpvec(__lsx_vbitsel_v(b.val, a.val, mask.val)); }

inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }
inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
{ return v_float64x2(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }
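// v_select is a per-bit blend: __lsx_vbitsel_v picks bits from a where the mask bit is 1 and
// from b where it is 0, so the mask is expected to be a comparison result
// (all-ones or all-zeros per lane).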
#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec)                       \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)    \
    { return ~( a == b ); }                                         \
    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b)     \
    { return b > a; }                                               \
    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)    \
    { return ~(a < b); }                                            \
    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)    \
    { return b >= a; }

#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix)   \
    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)         \
    { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); }                  \
    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b)          \
    { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); }                 \
    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)         \
    { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); }                  \
    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b)          \
    { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); }                  \
    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec)                                  \
    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix)            \
    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)    \
    { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); }           \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)    \
    { return ~(a == b); }

OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix)    \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)    \
    { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); }

#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix)                 \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix)         \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix)         \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix)          \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix)

inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }
inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }
inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }
inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }

inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(__lsx_vfcmp_cor_s(a.val, a.val)); }
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(__lsx_vfcmp_cor_d(a.val, a.val)); }
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint8x16, __lsx_vmin_bu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint8x16, __lsx_vmax_bu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int8x16, __lsx_vmin_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int8x16, __lsx_vmax_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint16x8, __lsx_vmin_hu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint16x8, __lsx_vmax_hu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int16x8, __lsx_vmin_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int16x8, __lsx_vmax_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint32x4, __lsx_vmin_wu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint32x4, __lsx_vmax_wu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int32x4, __lsx_vmin_w)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int32x4, __lsx_vmax_w)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float32x4, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float32x4, __lsx_vfmax_s)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float64x2, __lsx_vfmin_d)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float64x2, __lsx_vfmax_d)
template<int imm,
    bool is_invalid = ((imm < 0) || (imm > 16)),
    bool is_first = (imm == 0),
    bool is_half = (imm == 8),
    bool is_second = (imm == 16),
    bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
class v_lsx_palignr_u8_class;

template<int imm>
class v_lsx_palignr_u8_class<imm, true, false, false, false, false>;

template<int imm>
class v_lsx_palignr_u8_class<imm, false, true, false, false, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        CV_UNUSED(b);
        return a;
    }
};

template<int imm>
class v_lsx_palignr_u8_class<imm, false, false, true, false, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        return __lsx_vshuf4i_d(a, b, 0x9);
    }
};

template<int imm>
class v_lsx_palignr_u8_class<imm, false, false, false, true, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        CV_UNUSED(a);
        return b;
    }
};

template<int imm>
class v_lsx_palignr_u8_class<imm, false, false, false, false, true>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        enum { imm2 = (sizeof(__m128i) - imm) };
        return __lsx_vor_v(__lsx_vbsrl_v(a, imm), __lsx_vbsll_v(b, imm2));
    }
};

template<int imm>
inline __m128i v_lsx_palignr_u8(const __m128i& a, const __m128i& b)
{
    CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_lsx_palignr_u8");
    return v_lsx_palignr_u8_class<imm>()(a, b);
}
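// v_lsx_palignr_u8<imm> yields 16 consecutive bytes starting at byte offset imm of the 32-byte
// concatenation (b:a): imm==0 -> a, imm==16 -> b, imm==8 -> the two middle halves, and anything
// else combines a shifted right with b shifted left. The rotate helpers below build on it.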
#define OPENCV_HAL_IMPL_LSX_ROTATE_CAST(_Tpvec, cast)                                      \
    template<int imm>                                                                      \
    inline _Tpvec v_rotate_right(const _Tpvec& a)                                          \
    {                                                                                      \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };                        \
        __m128i ret = __lsx_vbsrl_v((__m128i)a.val, imm2);                                 \
        return _Tpvec(cast(ret));                                                          \
    }                                                                                      \
    template<int imm>                                                                      \
    inline _Tpvec v_rotate_left(const _Tpvec& a)                                           \
    {                                                                                      \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };                        \
        __m128i ret = __lsx_vbsll_v((__m128i)a.val, imm2);                                 \
        return _Tpvec(cast(ret));                                                          \
    }                                                                                      \
    template<int imm>                                                                      \
    inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                         \
    {                                                                                      \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };                        \
        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)a.val, (__m128i)b.val)));       \
    }                                                                                      \
    template<int imm>                                                                      \
    inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                          \
    {                                                                                      \
        enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };     \
        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)b.val, (__m128i)a.val)));       \
    }

OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint16x8, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int16x8, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint32x4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int32x4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint64x2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int64x2, OPENCV_HAL_NOP)

OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float32x4, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float64x2, _lsx_128_castsi128_pd)
inline v_uint8x16 v_reverse(const v_uint8x16& a)
{
    __m128i vec = __lsx_vshuf4i_b(a.val, 0x1B);
    return v_uint8x16(__lsx_vshuf4i_w(vec, 0x1B));
}

inline v_int8x16 v_reverse(const v_int8x16& a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8& a)
{
    __m128i vec = __lsx_vshuf4i_h(a.val, 0x1B);
    return v_uint16x8(__lsx_vshuf4i_w(vec, 0x4E));
}

inline v_int16x8 v_reverse(const v_int16x8& a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4& a)
{ return v_uint32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }

inline v_int32x4 v_reverse(const v_int32x4& a)
{ return v_int32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }

inline v_uint64x2 v_reverse(const v_uint64x2& a)
{ return v_uint64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }

inline v_int64x2 v_reverse(const v_int64x2& a)
{ return v_int64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }

inline v_float32x4 v_reverse(const v_float32x4& a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float64x2 v_reverse(const v_float64x2& a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    __m128i t1 = __lsx_vhaddw_hu_bu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline int v_reduce_sum(const v_int8x16& a)
{
    __m128i t1 = __lsx_vhaddw_h_b(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_w_h(t1, t1);
    __m128i t3 = __lsx_vhaddw_d_w(t2, t2);
    __m128i t4 = __lsx_vhaddw_q_d(t3, t3);
    return (int)__lsx_vpickve2gr_w(t4, 0);
}
#define OPENCV_HAL_IMPL_LSX_REDUCE_16(_Tpvec, sctype, func, intrin)    \
    inline sctype v_reduce_##func(const _Tpvec& a)                     \
    {                                                                  \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8));          \
        val = intrin(val, __lsx_vbsrl_v(val, 4));                      \
        val = intrin(val, __lsx_vbsrl_v(val, 2));                      \
        val = intrin(val, __lsx_vbsrl_v(val, 1));                      \
        return (sctype)__lsx_vpickve2gr_b(val, 0);                     \
    }

#define OPENCV_HAL_IMPL_LSX_REDUCE_8(_Tpvec, sctype, func, intrin)     \
    inline sctype v_reduce_##func(const _Tpvec& a)                     \
    {                                                                  \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8));          \
        val = intrin(val, __lsx_vbsrl_v(val, 4));                      \
        val = intrin(val, __lsx_vbsrl_v(val, 2));                      \
        return (sctype)__lsx_vpickve2gr_h(val, 0);                     \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, min, __lsx_vmin_h)
OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, max, __lsx_vmax_h)
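// The reduce macros fold the vector in halves (8-, 4-, 2-, then 1-byte strides via __lsx_vbsrl_v),
// so a horizontal min/max costs log2(nlanes) vector ops before the final scalar extract.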
#define OPENCV_HAL_IMPL_LSX_REDUCE_4(_Tpvec, sctype, func, intrin)     \
    inline sctype v_reduce_##func(const _Tpvec& a)                     \
    {                                                                  \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8));          \
        val = intrin(val, __lsx_vbsrl_v(val, 4));                      \
        return (sctype)__lsx_vpickve2gr_w(val, 0);                     \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, min, __lsx_vmin_wu)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, max, __lsx_vmax_wu)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, min, __lsx_vmin_w)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, max, __lsx_vmax_w)
#define OPENCV_HAL_IMPL_LSX_REDUCE_FLT(func, intrin)                      \
    inline float v_reduce_##func(const v_float32x4& a)                    \
    {                                                                     \
        __m128 val = a.val;                                               \
        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 8));        \
        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 4));        \
        float* fval = (float*)&val;                                       \
        return fval[0];                                                   \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_FLT(min, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LSX_REDUCE_FLT(max, __lsx_vfmax_s)
inline int v_reduce_sum(const v_int32x4& a)
{
    __m128i t1 = __lsx_vhaddw_d_w(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_q_d(t1, t1);
    return (int)__lsx_vpickve2gr_w(t2, 0);
}

inline unsigned v_reduce_sum(const v_uint32x4& a)
{
    __m128i t1 = __lsx_vhaddw_du_wu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (int)__lsx_vpickve2gr_w(t2, 0);
}

inline int v_reduce_sum(const v_int16x8& a)
{
    __m128i t1 = __lsx_vhaddw_w_h(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_d_w(t1, t1);
    __m128i t3 = __lsx_vhaddw_q_d(t2, t2);
    return (int)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sum(const v_uint16x8& a)
{
    __m128i t1 = __lsx_vhaddw_wu_hu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (int)__lsx_vpickve2gr_w(t3, 0);
}

inline float v_reduce_sum(const v_float32x4& a)
{
    __m128i val = (__m128i)a.val;
    val = __lsx_vbsrl_v(val, 8);
    __m128 result = __lsx_vfadd_s(a.val, (__m128)val);
    float* pa = (float*)&result;
    return (float)(pa[0] + pa[1]);
}

inline uint64 v_reduce_sum(const v_uint64x2& a)
{
    __m128i t0 = __lsx_vhaddw_qu_du(a.val, a.val);
    return (uint64)__lsx_vpickve2gr_du(t0, 0);
}

inline int64 v_reduce_sum(const v_int64x2& a)
{
    __m128i t0 = __lsx_vhaddw_q_d(a.val, a.val);
    return (int64)__lsx_vpickve2gr_d(t0, 0);
}

inline double v_reduce_sum(const v_float64x2& a)
{
    double* pa = (double*)&a;
    return pa[0] + pa[1];
}
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    __m128i a0 = (__m128i)a.val;
    __m128i b0 = (__m128i)b.val;
    __m128i c0 = (__m128i)c.val;
    __m128i d0 = (__m128i)d.val;
    __m128i ac_l = __lsx_vilvl_w(c0, a0);
    __m128i ac_h = __lsx_vilvh_w(c0, a0);
    __m128i bd_l = __lsx_vilvl_w(d0, b0);
    __m128i bd_h = __lsx_vilvh_w(d0, b0);
    __m128 ac = __lsx_vfadd_s((__m128)ac_l, (__m128)ac_h);
    __m128 bd = __lsx_vfadd_s((__m128)bd_l, (__m128)bd_h);
    return v_float32x4(__lsx_vfadd_s((__m128)__lsx_vilvl_w((__m128i)bd, (__m128i)ac),
                                     (__m128)__lsx_vilvh_w((__m128i)bd, (__m128i)ac)));
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    __m128i t0 = __lsx_vabsd_b(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i t0 = __lsx_vabsd_bu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i t0 = __lsx_vabsd_hu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    __m128i t0 = __lsx_vabsd_h(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i t0 = __lsx_vabsd_wu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
}

inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    __m128i t0 = __lsx_vabsd_w(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
}
#define OPENCV_HAL_IMPL_LSX_POPCOUNT(_Tpvec, _Tp, suffix)    \
    inline _Tpvec v_popcount(const _Tp& a)                   \
    { return _Tpvec(__lsx_vpcnt_##suffix(a.val)); }
#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt)   \
    inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }

OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
inline int v_signmask(const v_int8x16& a)
{
    __m128i result = __lsx_vmskltz_b(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_uint8x16& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }

inline int v_signmask(const v_int16x8& a)
{
    __m128i result = __lsx_vmskltz_h(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_uint16x8& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    __m128i result = __lsx_vmskltz_w(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }

inline int v_signmask(const v_uint64x2& a)
{
    __m128i result = __lsx_vmskltz_d(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
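// v_signmask packs the per-lane sign bits (collected by __lsx_vmskltz_*) into the low bits of a
// scalar: for a comparison result, bit i of the return value is set iff lane i matched.
// v_check_all/v_check_any below are built directly on top of it.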
#define OPENCV_HAL_IMPL_LSX_CHECK(_Tpvec, allmask)                                    \
    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; }     \
    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }

OPENCV_HAL_IMPL_LSX_CHECK(v_int8x16, 65535)
OPENCV_HAL_IMPL_LSX_CHECK(v_int16x8, 255)
OPENCV_HAL_IMPL_LSX_CHECK(v_int32x4, 15)
#define OPENCV_HAL_IMPL_LSX_MULADD(_Tpvec, suffix)                                 \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)         \
    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); }                 \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)      \
    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); }                 \
    inline _Tpvec v_sqrt(const _Tpvec& x)                                          \
    { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); }                               \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)                \
    { return v_fma(a, a, b * b); }                                                 \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)                    \
    { return v_sqrt(v_fma(a, a, b * b)); }

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_int32x4(__lsx_vmadd_w(c.val, a.val, b.val)); }
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_fma(a, b, c); }
#define OPENCV_HAL_IMPL_LSX_ABS(_Tpvec, suffix)                              \
    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)                            \
    { return v_u##_Tpvec(__lsx_vabsd_##suffix(x.val, __lsx_vldi(0))); }

OPENCV_HAL_IMPL_LSX_ABS(int8x16, b)
OPENCV_HAL_IMPL_LSX_ABS(int16x8, h)
OPENCV_HAL_IMPL_LSX_ABS(int32x4, w)

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(*((__m128i*)&x) & __lsx_vreplgr2vr_w(0x7fffffff)); }
inline v_float64x2 v_abs(const v_float64x2& x)
{ return v_float64x2(*((__m128i*)&x) & __lsx_vreplgr2vr_d(0x7fffffffffffffff)); }
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return (v_uint8x16)__lsx_vabsd_bu(a.val, b.val); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return (v_uint16x8)__lsx_vabsd_hu(a.val, b.val); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return (v_uint32x4)__lsx_vabsd_wu(a.val, b.val); }

inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{ return (v_uint8x16)__lsx_vabsd_b(a.val, b.val); }
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return (v_uint16x8)__lsx_vabsd_h(a.val, b.val); }
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(a - b); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(a - b); }
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(__lsx_vftint_w_s(a.val)); }
inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(__lsx_vftint_w_d(a.val, a.val)); }
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{ return v_int32x4(__lsx_vftint_w_d(b.val, a.val)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(a.val)); }
inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(__lsx_vftintrz_w_d(a.val, a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrm_s(a.val)))); }
inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrp_s(a.val)))); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(__lsx_vfcvt_s_d(a.val, a.val)); }
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(__lsx_vfcvt_s_d(b.val, a.val)); }
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    return v_int8x16(_v128_setr_b(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
               tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]], tab[idx[8]], tab[idx[9]],
               tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]],
               tab[idx[14]], tab[idx[15]]));
}

inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    return v_int8x16(_v128_setr_h(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]),
               *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]), *(const short*)(tab + idx[4]),
               *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
}

inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x16(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
               *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
}

inline v_uint8x16 v_lut(const uchar* tab, const int* idx)
{ return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x8(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
               *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
}

inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
}

inline v_uint16x8 v_lut(const ushort* tab, const int* idx)
{ return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx)
{ return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx)
{ return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }

inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
{ return v_reinterpret_as_u64(v_lut((const int64_t*)tab, idx)); }

inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    return v_float32x4((__m128)_v128_setr_pd(*(const double*)(tab + idx[0]),
                                             *(const double*)(tab + idx[1])));
}
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int* idx = (int*)&idxvec.val;
    return v_lut(tab, idx);
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int*)tab, idxvec));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    const int* idx = (const int*)&idxvec.val;
    return v_lut(tab, idx);
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    const int* idx = (const int*)&idxvec.val;
    return v_lut(tab, idx);
}
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec,
                               v_float32x4& x, v_float32x4& y)
{
    const int* idx = (const int*)&idxvec.val;
    __m128i xy0 = __lsx_vld(tab + idx[0], 0);
    __m128i xy1 = __lsx_vld(tab + idx[1], 0);
    __m128i xy2 = __lsx_vld(tab + idx[2], 0);
    __m128i xy3 = __lsx_vld(tab + idx[3], 0);
    __m128i xy01 = __lsx_vilvl_d(xy1, xy0);
    __m128i xy23 = __lsx_vilvl_d(xy3, xy2);
    __m128i xxyy02 = __lsx_vilvl_w(xy23, xy01);
    __m128i xxyy13 = __lsx_vilvh_w(xy23, xy01);
    x = v_float32x4((__m128)__lsx_vilvl_w(xxyy13, xxyy02));
    y = v_float32x4((__m128)__lsx_vilvh_w(xxyy13, xxyy02));
}
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec,
                               v_float64x2& x, v_float64x2& y)
{
    const int* idx = (const int*)&idxvec.val;
    __m128i xy0 = __lsx_vld(tab + idx[0], 0);
    __m128i xy1 = __lsx_vld(tab + idx[1], 0);
    x = v_float64x2((__m128d)__lsx_vilvl_d(xy1, xy0));
    y = v_float64x2((__m128d)__lsx_vilvh_d(xy1, xy0));
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
               _v128_setr_d(0x0705060403010200, 0x0f0d0e0c0b090a08)));
}

inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
               _v128_setr_d(0x0703060205010400, 0x0f0b0e0a0d090c08)));
}

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
               _v128_setr_d(0x0706030205040100, 0x0f0e0b0a0d0c0908)));
}

inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
               _v128_setr_d(0x0b0a030209080100, 0x0f0e07060d0c0504)));
}

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{ return v_int32x4(__lsx_vshuf4i_w(vec.val, 0xd8)); }

inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    __m128i zero = __lsx_vldi(0);
    return v_int8x16(__lsx_vshuf_b(zero, vec.val,
               _v128_set_d(0x1211100f0e0d0c0a, 0x0908060504020100)));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    __m128i zero = __lsx_vldi(0);
    return v_int16x8(__lsx_vshuf_b(zero, vec.val,
               _v128_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    __m128i x = a.val, y = b.val;
    return v_int32x4(__lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    __m128i x = a.val, y = b.val, z = c.val;
    __m128i t = __lsx_vmaddwev_w_h(z, x, y);
    return v_int32x4(__lsx_vmaddwod_w_h(t, x, y));
}

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    __m128i x = a.val, y = b.val;
    return v_int64x2(__lsx_vmaddwod_d_w(__lsx_vmulwev_d_w(x, y), x, y));
}

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    __m128i x = a.val, y = b.val, z = c.val;
    __m128i t = __lsx_vmaddwev_d_w(z, x, y);
    return v_int64x2(__lsx_vmaddwod_d_w(t, x, y));
}

inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_h_bu(x, y);
    __m128i odd = __lsx_vmulwod_h_bu(x, y);
    __m128i prod0 = __lsx_vhaddw_wu_hu(even, even);
    __m128i prod1 = __lsx_vhaddw_wu_hu(odd, odd);
    return v_uint32x4(__lsx_vadd_w(prod0, prod1));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_h_b(x, y);
    __m128i odd = __lsx_vmulwod_h_b(x, y);
    __m128i prod0 = __lsx_vhaddw_w_h(even, even);
    __m128i prod1 = __lsx_vhaddw_w_h(odd, odd);
    return v_int32x4(__lsx_vadd_w(prod0, prod1));
}

inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_w_hu(x, y);
    __m128i odd = __lsx_vmulwod_w_hu(x, y);
    __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
    __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
    return v_uint64x2(__lsx_vadd_d(prod0, prod1));
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_w_h(x, y);
    __m128i odd = __lsx_vmulwod_w_h(x, y);
    __m128i prod0 = __lsx_vhaddw_d_w(even, even);
    __m128i prod1 = __lsx_vhaddw_d_w(odd, odd);
    return v_int64x2(__lsx_vadd_d(prod0, prod1));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_w_hu(x, y);
    __m128i odd = __lsx_vmulwod_w_hu(x, y);
    __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
    __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
    return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0),
                                    __lsx_vhaddw_qu_du(prod1, prod1)));
}

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i prod = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y);
    __m128i sign = __lsx_vsrai_w(prod, 31);
    __m128i lo = __lsx_vilvl_w(sign, prod);
    __m128i hi = __lsx_vilvh_w(sign, prod);
    return v_int64x2(__lsx_vadd_d(lo, hi));
}
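// The dot products above follow one pattern: multiply even and odd lanes into widened products
// (__lsx_vmulwev_*/__lsx_vmulwod_*, or the accumulating __lsx_vmaddw* forms), then add the two
// streams; the *_expand variants additionally horizontally add neighbouring products
// (__lsx_vhaddw_*) to reach the 4x-wider accumulator type.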
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128i x = (__m128i)v.val;
    __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
    __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
    __m128 v2 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val);
    __m128 v3 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xFF), m3.val);

    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), __lsx_vfadd_s(v2, v3)));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128i x = (__m128i)v.val;
    __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
    __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
    __m128 v2 = __lsx_vfmadd_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val, a.val);

    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), v2));
}
#define OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(_Tpvec, cast_from, cast_to)            \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
                               const _Tpvec& a2, const _Tpvec& a3,              \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
    {                                                                           \
        __m128i t0 = cast_from(__lsx_vilvl_w(a1.val, a0.val));                  \
        __m128i t1 = cast_from(__lsx_vilvl_w(a3.val, a2.val));                  \
        __m128i t2 = cast_from(__lsx_vilvh_w(a1.val, a0.val));                  \
        __m128i t3 = cast_from(__lsx_vilvh_w(a3.val, a2.val));                  \
        b0.val = cast_to(__lsx_vilvl_d(t1, t0));                                \
        b1.val = cast_to(__lsx_vilvh_d(t1, t0));                                \
        b2.val = cast_to(__lsx_vilvl_d(t3, t2));                                \
        b3.val = cast_to(__lsx_vilvh_d(t3, t2));                                \
    }

OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_uint32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_int32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)

inline void v_transpose4x4(const v_float32x4& a0, const v_float32x4& a1,
                           const v_float32x4& a2, const v_float32x4& a3,
                           v_float32x4& b0, v_float32x4& b1, v_float32x4& b2, v_float32x4& b3)
{
    __m128i vec0 = (__m128i)a0.val, vec1 = (__m128i)a1.val;
    __m128i vec2 = (__m128i)a2.val, vec3 = (__m128i)a3.val;
    __m128i t0 = __lsx_vilvl_w(vec1, vec0);
    __m128i t1 = __lsx_vilvl_w(vec3, vec2);
    __m128i t2 = __lsx_vilvh_w(vec1, vec0);
    __m128i t3 = __lsx_vilvh_w(vec3, vec2);
    b0.val = __m128(__lsx_vilvl_d(t1, t0));
    b1.val = __m128(__lsx_vilvh_d(t1, t0));
    b2.val = __m128(__lsx_vilvl_d(t3, t2));
    b3.val = __m128(__lsx_vilvh_d(t3, t2));
}
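// Usage sketch (hypothetical pointer m to 16 packed floats): transposing a 4x4 tile in registers:
//   v_float32x4 r0 = v_load(m), r1 = v_load(m + 4), r2 = v_load(m + 8), r3 = v_load(m + 12);
//   v_float32x4 c0, c1, c2, c3;
//   v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);   // c0..c3 now hold the columns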
#define OPENCV_HAL_IMPL_LSX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin_lo, intrin_hi)   \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)              \
    {                                                                            \
        b0.val = intrin_lo(a.val, 0);                                            \
        b1.val = intrin_hi(a.val);                                               \
    }                                                                            \
    inline _Tpwvec v_expand_low(const _Tpvec& a)                                 \
    { return _Tpwvec(intrin_lo(a.val, 0)); }                                     \
    inline _Tpwvec v_expand_high(const _Tpvec& a)                                \
    { return _Tpwvec(intrin_hi(a.val)); }                                        \
    inline _Tpwvec v_load_expand(const _Tp* ptr)                                 \
    {                                                                            \
        __m128i a = __lsx_vld(ptr, 0);                                           \
        return _Tpwvec(intrin_lo(a, 0));                                         \
    }

OPENCV_HAL_IMPL_LSX_EXPAND(v_int16x8, v_int32x4, short, __lsx_vsllwil_w_h, __lsx_vexth_w_h)
OPENCV_HAL_IMPL_LSX_EXPAND(v_uint32x4, v_uint64x2, unsigned, __lsx_vsllwil_du_wu, __lsx_vexth_du_wu)
OPENCV_HAL_IMPL_LSX_EXPAND(v_int32x4, v_int64x2, int, __lsx_vsllwil_d_w, __lsx_vexth_d_w)

#define OPENCV_HAL_IMPL_LSX_EXPAND_Q(_Tpvec, _Tp, intrin_lo, intrin_hi)    \
    inline _Tpvec v_load_expand_q(const _Tp* ptr)                          \
    {                                                                      \
        __m128i a = __lsx_vld(ptr, 0);                                     \
        __m128i b = intrin_lo(a, 0);                                       \
        return _Tpvec(intrin_hi(b, 0));                                    \
    }

OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_uint32x4, uchar, __lsx_vsllwil_hu_bu, __lsx_vsllwil_wu_hu)
OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_int32x4, schar, __lsx_vsllwil_h_b, __lsx_vsllwil_w_h)
inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(_lsx_packs_h(a.val, b.val)); }

inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, 0)); }

inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(_lsx_packus_h(a.val, b.val)); }

template<int n> inline
v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{ __lsx_vstelm_d(__lsx_vssrlrni_bu_h(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(__lsx_vssrarni_bu_h(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{ __lsx_vstelm_d(__lsx_vssrarni_bu_h(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(__lsx_vssrarni_b_h(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{ __lsx_vstelm_d(__lsx_vssrarni_b_h(a.val, a.val, n), ptr, 0, 0); }
inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, 0)); }

inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, 0)); }

inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, 0)); }

inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, 0), ptr, 0, 0); }

inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, 0), ptr, 0, 0); }

template<int n> inline
v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{ __lsx_vstelm_d(__lsx_vssrarni_h_w(a.val, a.val, n), ptr, 0, 0); }
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{ return v_uint32x4(__lsx_vpickev_w(b.val, a.val)); }

inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{ __lsx_vstelm_d(__lsx_vshuf4i_w(a.val, 0x08), ptr, 0, 0); }

inline void v_pack_store(int* ptr, const v_int64x2& a)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(a)); }

template<int n> inline
v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{ return v_uint32x4(__lsx_vsrlrni_w_d(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{ __lsx_vstelm_d(__lsx_vsrlrni_w_d(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{ return v_int32x4(__lsx_vsrarni_w_d(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{ __lsx_vstelm_d(__lsx_vsrarni_w_d(a.val, a.val, n), ptr, 0, 0); }
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint8x16(__lsx_vssrarni_b_h(b.val, a.val, 0)); }

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    __m128i ab = __lsx_vssrarni_h_w(b.val, a.val, 0);
    __m128i cd = __lsx_vssrarni_h_w(d.val, c.val, 0);
    return v_uint8x16(__lsx_vssrarni_b_h(cd, ab, 0));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    __m128i ab = __lsx_vssrarni_w_d(b.val, a.val, 0);
    __m128i cd = __lsx_vssrarni_w_d(d.val, c.val, 0);
    __m128i ef = __lsx_vssrarni_w_d(f.val, e.val, 0);
    __m128i gh = __lsx_vssrarni_w_d(h.val, g.val, 0);

    __m128i abcd = __lsx_vssrarni_h_w(cd, ab, 0);
    __m128i efgh = __lsx_vssrarni_h_w(gh, ef, 0);
    return v_uint8x16(__lsx_vssrarni_b_h(efgh, abcd, 0));
}
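// v_rshr_pack<n> narrows with a rounding right shift by n before saturating, so e.g.
// v_rshr_pack<8>(v_uint16x8, v_uint16x8) is a fixed-point descale-and-pack in a single
// __lsx_vssrlrni_bu_h; the plain v_pack variants above are simply the n == 0 case.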
#define OPENCV_HAL_IMPL_LSX_EXTRACT(_Tpvec)                       \
    template<int s>                                               \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)     \
    { return v_rotate_right<s>(a, b); }

#define OPENCV_HAL_IMPL_LSX_EXTRACT_N(_Tpvec, _Twvec, intrin)     \
    template<int i>                                               \
    inline _Twvec v_extract_n(const _Tpvec& a)                    \
    { return (_Twvec)intrin(a.val, i); }

OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int8x16, schar, __lsx_vpickve2gr_b)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int16x8, short, __lsx_vpickve2gr_h)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint32x4, uint, __lsx_vpickve2gr_w)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int32x4, int, __lsx_vpickve2gr_w)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int64x2, int64, __lsx_vpickve2gr_d)
template<int i>
inline float v_extract_n(const v_float32x4& v)
{
    union { uint iv; float fv; } d;
    d.iv = __lsx_vpickve2gr_w(v.val, i);
    return d.fv;
}

template<int i>
inline double v_extract_n(const v_float64x2& v)
{
    union { uint64 iv; double dv; } d;
    d.iv = __lsx_vpickve2gr_d(v.val, i);
    return d.dv;
}

template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{ return v_uint32x4(__lsx_vreplvei_w(a.val, i)); }

template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{ return v_int32x4(__lsx_vreplvei_w(a.val, i)); }

template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{ return v_float32x4((__m128)__lsx_vreplvei_w((__m128i)a.val, i)); }
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);

    a.val = __lsx_vpickev_b(t1, t0);
    b.val = __lsx_vpickod_b(t1, t0);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vpickev_h(t1, t0);
    b.val = __lsx_vpickod_h(t1, t0);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vpickev_w(t1, t0);
    b.val = __lsx_vpickod_w(t1, t0);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vilvl_d(t1, t0);
    b.val = __lsx_vilvh_d(t1, t0);
}
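// Two-channel deinterleave is a single even/odd pick (__lsx_vpickev_*/__lsx_vpickod_*) per output
// for 8-32 bit lanes; the 64-bit case instead recombines the low/high halves with vilvl_d/vilvh_d.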
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    const __m128i shuff0 = _v128_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i shuff1 = _v128_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff0);
    __m128i b0 = __lsx_vbitsel_v(t1, t0, shuff1);
    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
    const __m128i shuff_a = _v128_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29);
    const __m128i shuff_b = _v128_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30);
    const __m128i shuff_c = _v128_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31);

    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    const __m128i shuff0 = _v128_setr_h(0, 0, -1, 0, 0, -1, 0, 0);
    const __m128i shuff1 = _v128_setr_h(0, -1, 0, 0, -1, 0, 0, -1);

    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff1);
    __m128i b0 = __lsx_vbitsel_v(t0, t1, shuff0);
    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);

    const __m128i shuff_a = _v128_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 20, 21, 26, 27);
    const __m128i shuff_b = _v128_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 16, 17, 22, 23, 28, 29);
    const __m128i shuff_c = _v128_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31);

    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);

    __m128i a0 = __lsx_vpermi_w(t1, t0, 0xAC);
    __m128i b0 = __lsx_vpermi_w(t1, t0, 0xC5);
    __m128i c0 = __lsx_vpermi_w(t1, t0, 0x5A);

    a.val = __lsx_vextrins_w(a0, t2, 0x31);
    b0 = __lsx_vshuf4i_w(b0, 0x38);
    c0 = __lsx_vshuf4i_w(c0, 0x8);
    b.val = __lsx_vextrins_w(b0, t2, 0x32);
    c.val = __lsx_vpermi_w(t2, c0, 0xC4);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);

    a.val = __lsx_vshuf4i_d(t0, t1, 0xC);
    b.val = __lsx_vshuf4i_d(t0, t2, 0x9);
    c.val = __lsx_vshuf4i_d(t1, t2, 0xC);
}
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
                                v_uint8x16& c, v_uint8x16& d)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    __m128i t3 = __lsx_vld(ptr, 48);

    __m128i ac_lo = __lsx_vpickev_b(t1, t0);
    __m128i bd_lo = __lsx_vpickod_b(t1, t0);
    __m128i ac_hi = __lsx_vpickev_b(t3, t2);
    __m128i bd_hi = __lsx_vpickod_b(t3, t2);

    a.val = __lsx_vpickev_b(ac_hi, ac_lo);
    c.val = __lsx_vpickod_b(ac_hi, ac_lo);
    b.val = __lsx_vpickev_b(bd_hi, bd_lo);
    d.val = __lsx_vpickod_b(bd_hi, bd_lo);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
                                v_uint16x8& c, v_uint16x8& d)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    __m128i t3 = __lsx_vld(ptr, 48);

    __m128i ac_lo = __lsx_vpickev_h(t1, t0);
    __m128i bd_lo = __lsx_vpickod_h(t1, t0);
    __m128i ac_hi = __lsx_vpickev_h(t3, t2);
    __m128i bd_hi = __lsx_vpickod_h(t3, t2);

    a.val = __lsx_vpickev_h(ac_hi, ac_lo);
    c.val = __lsx_vpickod_h(ac_hi, ac_lo);
    b.val = __lsx_vpickev_h(bd_hi, bd_lo);
    d.val = __lsx_vpickod_h(bd_hi, bd_lo);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b,
                                v_uint32x4& c, v_uint32x4& d)
{
    __m128i p0 = __lsx_vld(ptr, 0);
    __m128i p1 = __lsx_vld(ptr, 16);
    __m128i p2 = __lsx_vld(ptr, 32);
    __m128i p3 = __lsx_vld(ptr, 48);

    __m128i t0 = __lsx_vilvl_w(p1, p0);
    __m128i t1 = __lsx_vilvl_w(p3, p2);
    __m128i t2 = __lsx_vilvh_w(p1, p0);
    __m128i t3 = __lsx_vilvh_w(p3, p2);
    a.val = __lsx_vilvl_d(t1, t0);
    b.val = __lsx_vilvh_d(t1, t0);
    c.val = __lsx_vilvl_d(t3, t2);
    d.val = __lsx_vilvh_d(t3, t2);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b,
                                v_uint64x2& c, v_uint64x2& d)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    __m128i t3 = __lsx_vld(ptr, 48);

    a.val = __lsx_vilvl_d(t2, t0);
    b.val = __lsx_vilvh_d(t2, t0);
    c.val = __lsx_vilvl_d(t3, t1);
    d.val = __lsx_vilvh_d(t3, t1);
}
inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    __m128i v0 = __lsx_vilvl_b(b.val, a.val);
    __m128i v1 = __lsx_vilvh_b(b.val, a.val);

    __lsx_vst(v0, ptr, 0);
    __lsx_vst(v1, ptr, 16);
}

inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    __m128i v0 = __lsx_vilvl_h(b.val, a.val);
    __m128i v1 = __lsx_vilvh_h(b.val, a.val);

    __lsx_vst(v0, ptr, 0);
    __lsx_vst(v1, ptr, 16);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    __m128i v0 = __lsx_vilvl_w(b.val, a.val);
    __m128i v1 = __lsx_vilvh_w(b.val, a.val);

    __lsx_vst(v0, ptr, 0);
    __lsx_vst(v1, ptr, 16);
}

inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    __m128i v0 = __lsx_vilvl_d(b.val, a.val);
    __m128i v1 = __lsx_vilvh_d(b.val, a.val);

    __lsx_vst(v0, ptr, 0);
    __lsx_vst(v1, ptr, 16);
}
inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                               const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
    __m128i v_c = c.val;
    const __m128i shuff0 = _v128_setr_b(0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10);
    const __m128i shuff1 = _v128_setr_b(11, 21, 12, 13, 22, 14, 15, 23, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 24, 18, 19, 25, 20, 21);
    const __m128i shuff3 = _v128_setr_b(26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31);
    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);

    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
    dst1 = __lsx_vshuf_b(abc, dst1, shuff2);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
}

inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                               const v_uint16x8& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
    __m128i v_c = c.val;
    const __m128i shuff0 = _v128_setr_b(0, 1, 2, 3, 16, 17, 4, 5, 6, 7, 18, 19, 8, 9, 10, 11);
    const __m128i shuff1 = _v128_setr_b(20, 21, 12, 13, 14, 15, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 24, 25, 20, 21);
    const __m128i shuff3 = _v128_setr_b(6, 7, 26, 27, 8, 9, 10, 11, 28, 29, 12, 13, 14, 15, 30, 31);
    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);

    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
    dst1 = __lsx_vshuf_b(abc, dst1, shuff2);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    __m128i v_c = c.val;
    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
    __m128i bc_od = __lsx_vpackod_w(v_c, b.val);

    __m128i dst0 = __lsx_vshuf4i_w(ab_lo, 0xB4);
    __m128i dst1 = __lsx_vilvl_d(ab_hi, bc_od);
    __m128i dst2 = __lsx_vpermi_w(bc_od, ab_hi, 0xE8);

    dst0 = __lsx_vextrins_w(dst0, v_c, 0x20);
    dst2 = __lsx_vextrins_w(dst2, v_c, 0x2);
    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
}

inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
    __m128i dst1 = __lsx_vpermi_w(a.val, c.val, 0xE4);
    __m128i dst2 = __lsx_vilvh_d(c.val, b.val);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
}
2367 __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
2368 __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
2369 __m128i cd_lo = __lsx_vilvl_b(d.val, c.val);
2370 __m128i cd_hi = __lsx_vilvh_b(d.val, c.val);
2372 __m128i
dst0 = __lsx_vilvl_h(cd_lo, ab_lo);
2373 __m128i
dst1 = __lsx_vilvh_h(cd_lo, ab_lo);
2374 __m128i
dst2 = __lsx_vilvl_h(cd_hi, ab_hi);
2375 __m128i
dst3 = __lsx_vilvh_h(cd_hi, ab_hi);
2377 __lsx_vst(
dst0, ptr, 0);
2378 __lsx_vst(
dst1, ptr, 16);
2379 __lsx_vst(
dst2, ptr, 32);
2380 __lsx_vst(
dst3, ptr, 48);
inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                               const v_uint16x8& c, const v_uint16x8& d,
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
    __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
    __m128i cd_lo = __lsx_vilvl_h(d.val, c.val);
    __m128i cd_hi = __lsx_vilvh_h(d.val, c.val);

    __m128i dst0 = __lsx_vilvl_w(cd_lo, ab_lo);
    __m128i dst1 = __lsx_vilvh_w(cd_lo, ab_lo);
    __m128i dst2 = __lsx_vilvl_w(cd_hi, ab_hi);
    __m128i dst3 = __lsx_vilvh_w(cd_hi, ab_hi);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
    __lsx_vst(dst3, ptr, 48);
}
inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d,
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
    __m128i cd_lo = __lsx_vilvl_w(d.val, c.val);
    __m128i cd_hi = __lsx_vilvh_w(d.val, c.val);

    __m128i dst0 = __lsx_vilvl_d(cd_lo, ab_lo);
    __m128i dst1 = __lsx_vilvh_d(cd_lo, ab_lo);
    __m128i dst2 = __lsx_vilvl_d(cd_hi, ab_hi);
    __m128i dst3 = __lsx_vilvh_d(cd_hi, ab_hi);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
    __lsx_vst(dst3, ptr, 48);
}
inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c, const v_uint64x2& d,
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
{
    __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
    __m128i dst2 = __lsx_vilvh_d(b.val, a.val);
    __m128i dst1 = __lsx_vilvl_d(d.val, c.val);
    __m128i dst3 = __lsx_vilvh_d(d.val, c.val);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
    __lsx_vst(dst3, ptr, 48);
}
#define OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, \
                                _Tpvec0& c0, _Tpvec0& d0) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                               hal::StoreMode = hal::STORE_UNALIGNED) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1); \
} \
inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                               hal::StoreMode = hal::STORE_UNALIGNED) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                               const _Tpvec0& c0, const _Tpvec0& d0, \
                               hal::StoreMode = hal::STORE_UNALIGNED) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
}
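
// A sketch of how the wrapper macro above is typically instantiated so that the
// signed and floating-point vector types reuse the unsigned base implementations.
// The exact instantiation list below is an assumption (standard OpenCV type and
// reinterpret-suffix names), not taken from this header.
OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)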
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
#if CV_FP16
    // widen the low 4 half-float lanes to float32 in one instruction
    return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0)));
#else
    // scalar fallback (assumed): convert lane by lane through a temporary buffer
    float CV_DECL_ALIGNED(16) buf[4];
    for (int i = 0; i < 4; i++)
        buf[i] = (float)ptr[i];
    return v_load(buf);
#endif
}
inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
{
#if CV_FP16
    // narrow 4 float32 lanes to fp16 and store the low 64 bits
    __m128i res = (__m128i)__lsx_vfcvt_h_s(a.val, a.val);
    __lsx_vstelm_d(res, ptr, 0, 0);
#else
    // scalar fallback (assumed): spill to a buffer and convert lane by lane
    float CV_DECL_ALIGNED(16) buf[4];
    v_store_aligned(buf, a);
    for (int i = 0; i < 4; i++)
        ptr[i] = hfloat(buf[i]);
#endif
}
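
// Illustrative usage sketch (hypothetical helper, assuming the v_load_expand and
// v_store overloads in this header): widen a row of half floats to float32 four
// lanes at a time, with a scalar tail for the remainder.
static inline void widen_fp16_row_example(const hfloat* src, float* dst, int n)
{
    int i = 0;
    for (; i <= n - 4; i += 4)
        v_store(dst + i, v_load_expand(src + i));   // 4 fp16 lanes -> 4 fp32 lanes
    for (; i < n; i++)
        dst[i] = (float)src[i];                     // scalar tail
}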
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

#endif // OPENCV_HAL_INTRIN_LSX_HPP