#ifndef OPENCV_HAL_INTRIN_LSX_HPP
#define OPENCV_HAL_INTRIN_LSX_HPP

#define CV_SIMD128_64F 1
#define CV_SIMD128_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
inline __m128i _v128_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7,
                            char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
{
    return (__m128i)v16i8{ v0, v1, v2, v3, v4, v5, v6, v7,
                           v8, v9, v10, v11, v12, v13, v14, v15 };
}

inline __m128i _v128_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7,
                           char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
{
    return (__m128i)v16i8{ v15, v14, v13, v12, v11, v10, v9, v8,
                           v7, v6, v5, v4, v3, v2, v1, v0 };
}
inline __m128i _v128_setr_h(short v0, short v1, short v2, short v3,
                            short v4, short v5, short v6, short v7)
{
    return (__m128i)v8i16{ v0, v1, v2, v3, v4, v5, v6, v7 };
}

inline __m128i _v128_setr_w(int v0, int v1, int v2, int v3)
{
    return (__m128i)v4i32{ v0, v1, v2, v3 };
}

inline __m128i _v128_set_w(int v0, int v1, int v2, int v3)
{
    return (__m128i)v4i32{ v3, v2, v1, v0 };
}

inline __m128i _v128_setall_w(int v0)
{
    return __lsx_vreplgr2vr_w(v0);
}

inline __m128i _v128_setr_d(int64 v0, int64 v1)
{
    return (__m128i)v2i64{ v0, v1 };
}

inline __m128i _v128_set_d(int64 v0, int64 v1)
{
    return (__m128i)v2i64{ v1, v0 };
}

inline __m128 _v128_setr_ps(float v0, float v1, float v2, float v3)
{
    return (__m128)v4f32{ v0, v1, v2, v3 };
}

inline __m128 _v128_setall_ps(float v0)
{
    return (__m128)v4f32{ v0, v0, v0, v0 };
}

inline __m128d _v128_setr_pd(double v0, double v1)
{
    return (__m128d)v2f64{ v0, v1 };
}

inline __m128d _v128_setall_pd(double v0)
{
    return (__m128d)v2f64{ v0, v0 };
}

inline __m128i _lsx_packus_h(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_bu_h(b, a, 0);
}

inline __m128i _lsx_packs_h(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_b_h(b, a, 0);
}

inline __m128i _lsx_packus_w(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_hu_w(b, a, 0);
}
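// Illustrative sketch of the helpers above (hypothetical values; assumes the GCC
// vector-literal semantics used in this file): the "setr" variants place their first
// argument in lane 0, while the "set" variants store the arguments in reverse order.
//
//   __m128i lo_first = _v128_setr_w(1, 2, 3, 4);   // lane 0 == 1, lane 3 == 4
//   __m128i hi_first = _v128_set_w(1, 2, 3, 4);    // lane 0 == 4, lane 3 == 1
//
// _lsx_packus_h(a, b) narrows two s16 vectors into one u8 vector with unsigned
// saturation; lanes of `a` fill the low half of the result and lanes of `b` the high half.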
    typedef uchar lane_type;
        val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
        return (uchar)__lsx_vpickve2gr_bu(val, 0);

    typedef schar lane_type;
    enum { nlanes = 16 };
    explicit v_int8x16(__m128i v) : val(v) {}
        val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
        return (schar)__lsx_vpickve2gr_b(val, 0);

        val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
        return (ushort)__lsx_vpickve2gr_hu(val, 0);

    typedef short lane_type;
    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
        val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
        return (short)__lsx_vpickve2gr_h(val, 0);

    typedef unsigned lane_type;
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
        val = _v128_setr_w(v0, v1, v2, v3);
    unsigned get0() const
        return (unsigned)__lsx_vpickve2gr_wu(val, 0);

    typedef int lane_type;
    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
        val = _v128_setr_w(v0, v1, v2, v3);
        return (int)__lsx_vpickve2gr_w(val, 0);

    typedef float lane_type;
    explicit v_float32x4(__m128i v) { val = *((__m128*)&v); }
    v_float32x4(float v0, float v1, float v2, float v3)
        val = _v128_setr_ps(v0, v1, v2, v3);
        union { int iv; float fv; } d;
        d.iv = __lsx_vpickve2gr_w(val, 0);
    int get0toint() const
        __m128i result = __lsx_vftintrz_w_s(val);
        return (int)__lsx_vpickve2gr_w(result, 0);

        val = _v128_setr_d(v0, v1);
        return __lsx_vpickve2gr_du(val, 0);

    typedef int64 lane_type;
    explicit v_int64x2(__m128i v) : val(v) {}
        val = _v128_setr_d(v0, v1);
        return __lsx_vpickve2gr_d(val, 0);

    typedef double lane_type;
    explicit v_float64x2(__m128i v) { val = *((__m128d*)&v); }
        val = _v128_setr_pd(v0, v1);
        union { int64 iv; double fv; } d;
        d.iv = __lsx_vpickve2gr_d(val, 0);
    int64 get0toint64() const
        __m128i result = __lsx_vftintrz_l_d(val);
#define OPENCV_HAL_IMPL_LSX_LOADSTORE(_Tpvec, _Tp) \
    inline _Tpvec v_load(const _Tp* ptr) \
    { return _Tpvec(__lsx_vld(ptr, 0)); } \
    inline _Tpvec v_load_aligned(const _Tp* ptr) \
    { return _Tpvec(__lsx_vld(ptr, 0)); } \
    inline _Tpvec v_load_low(const _Tp* ptr) \
    { return _Tpvec(__lsx_vldrepl_d(ptr, 0)); } \
    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        __m128i vl = __lsx_vldrepl_d(ptr0, 0); \
        __m128i vh = __lsx_vldrepl_d(ptr1, 0); \
        return _Tpvec(__lsx_vilvl_d(vh, vl)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(a.val, ptr, 0); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(a.val, ptr, 0); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(a.val, ptr, 0); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if (mode == hal::STORE_UNALIGNED) \
            __lsx_vst(a.val, ptr, 0); \
        else if (mode == hal::STORE_ALIGNED_NOCACHE) \
            __lsx_vst(a.val, ptr, 0); \
        else \
            __lsx_vst(a.val, ptr, 0); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vstelm_d(a.val, ptr, 0, 0); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vstelm_d(a.val, ptr, 0, 1); }

OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int16x8,  short)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int32x4,  int)
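// Minimal usage sketch for the wrappers generated by the macro above (hypothetical
// buffer; v_int32x4 is just one of the instantiated types):
//
//   int buf[4] = { 1, 2, 3, 4 };
//   v_int32x4 v = v_load(buf);     // full 128-bit load (vld does not require alignment)
//   v_store(buf, v);               // full 128-bit store
//   int low2[2];
//   v_store_low(low2, v);          // stores only the low 64 bits (lanes 0 and 1)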
#define OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \
    inline _Tpvec v_load(const _Tp* ptr) \
    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \
    inline _Tpvec v_load_aligned(const _Tp* ptr) \
    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \
    inline _Tpvec v_load_low(const _Tp* ptr) \
    { return _Tpvec((halfreg)__lsx_vldrepl_d(ptr, 0)); } \
    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        __m128i vl = __lsx_vldrepl_d(ptr0, 0); \
        __m128i vh = __lsx_vldrepl_d(ptr1, 0); \
        return _Tpvec((halfreg)__lsx_vilvl_d(vh, vl)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst((__m128i)a.val, ptr, 0); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst((__m128i)a.val, ptr, 0); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst((__m128i)a.val, ptr, 0); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            __lsx_vst((__m128i)a.val, ptr, 0); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
            __lsx_vst((__m128i)a.val, ptr, 0); \
        else \
            __lsx_vst((__m128i)a.val, ptr, 0); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 0); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 1); }

OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float32x4, float,  __m128)
OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float64x2, double, __m128d)
inline __m128i _lsx_128_castps_si128(const __m128& v)
{ return __m128i(v); }

inline __m128i _lsx_128_castpd_si128(const __m128d& v)
{ return __m128i(v); }
#define OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
    { return _Tpvec(cast(a.val)); }

#define OPENCV_HAL_IMPL_LSX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
    inline _Tpvec v_setzero_##suffix() \
    { return _Tpvec(__lsx_vldi(0)); } \
    inline _Tpvec v_setall_##suffix(_Tp v) \
    { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); } \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float32x4, suffix, _lsx_128_castps_si128) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float64x2, suffix, _lsx_128_castpd_si128)

OPENCV_HAL_IMPL_LSX_INIT(v_int16x8,  short,    s16, h, int)
OPENCV_HAL_IMPL_LSX_INIT(v_uint32x4, unsigned, u32, w, int)
OPENCV_HAL_IMPL_LSX_INIT(v_int32x4,  int,      s32, w, int)
inline __m128 _lsx_128_castsi128_ps(const __m128i& v)
{ return __m128(v); }

inline __m128d _lsx_128_castsi128_pd(const __m128i& v)
{ return __m128d(v); }
#define OPENCV_HAL_IMPL_LSX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
    inline _Tpvec v_setzero_##suffix() \
    { return _Tpvec(__lsx_vldi(0)); } \
    inline _Tpvec v_setall_##suffix(_Tp v) \
    { return _Tpvec(_v128_setall_##zsuffix(v)); } \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,  suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8,  suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4,  suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2,  suffix, cast)

OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float32x4, float,  f32, ps, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float64x2, double, f64, pd, _lsx_128_castsi128_pd)
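// Sketch of the init/reinterpret helpers generated above (hypothetical values):
//
//   v_float32x4 f = v_setall_f32(1.5f);        // broadcast 1.5f into all 4 lanes
//   v_float32x4 z = v_setzero_f32();           // all lanes 0.0f
//   v_int32x4   i = v_reinterpret_as_s32(f);   // bit-level view of the same register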
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a)
{ return v_float32x4(_lsx_128_castps_si128(__m128(a.val))); }

inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a)
{ return v_float64x2(_lsx_128_castpd_si128(__m128d(a.val))); }
#define OPENCV_HAL_IMPL_LSX_UNPACK(_Tpvec, suffix) \
    inline _Tpvec v128_unpacklo(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_vilvl_##suffix(__m128i(b.val), __m128i(a.val))); } \
    inline _Tpvec v128_unpackhi(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_vilvh_##suffix(__m128i(b.val), __m128i(a.val))); }

#define OPENCV_HAL_IMPL_LSX_ZIP(_Tpvec) \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
    { return (_Tpvec)__lsx_vilvl_d((__m128i)b.val, (__m128i)a.val); } \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
    { return (_Tpvec)__lsx_vilvh_d((__m128i)b.val, (__m128i)a.val); } \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
                            _Tpvec& c, _Tpvec& d) \
    { \
        __m128i a1 = (__m128i)a.val, b1 = (__m128i)b.val; \
        c = _Tpvec(__lsx_vilvl_d(b1, a1)); \
        d = _Tpvec(__lsx_vilvh_d(b1, a1)); \
    } \
    inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
                      _Tpvec& ab0, _Tpvec& ab1) \
    { \
        ab0 = v128_unpacklo(a, b); \
        ab1 = v128_unpackhi(a, b); \
    }
#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16,  __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16,  __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8,  __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8,  __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4,  __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4,  __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4,  __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2,  __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2,  __lsx_vsub_d)

OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i pev = __lsx_vmulwev_w_hu(a0, b0);
    __m128i pod = __lsx_vmulwod_w_hu(a0, b0);
    __m128i pl  = __lsx_vilvl_w(pod, pev);
    __m128i ph  = __lsx_vilvh_w(pod, pev);
    return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
}

inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i pev = __lsx_vmulwev_w_h(a0, b0);
    __m128i pod = __lsx_vmulwod_w_h(a0, b0);
    __m128i pl  = __lsx_vilvl_w(pod, pev);
    __m128i ph  = __lsx_vilvh_w(pod, pev);
    return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
}

inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
{ a = a * b; return a; }
inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
{ a = a * b; return a; }
inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
{ a = a * b; return a; }
inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
{ a = a * b; return a; }
#define OPENCV_HAL_IMPL_LSX_BIN_FUNC(func, _Tpvec, intrin) \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint8x16, __lsx_vadd_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int8x16,  __lsx_vadd_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint16x8, __lsx_vadd_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int16x8,  __lsx_vadd_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint8x16, __lsx_vsub_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int8x16,  __lsx_vsub_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint16x8, __lsx_vsub_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int16x8,  __lsx_vsub_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_uint16x8, __lsx_vmul_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_int16x8,  __lsx_vmul_h)
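// The operator overloads above saturate for 8/16-bit lanes, while the *_wrap functions
// wrap around modulo 2^N. Hedged example (assumes v_setall_u8 from the elided
// v_uint8x16 init macro; values are hypothetical):
//
//   v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//   v_uint8x16 s = a + b;              // saturating add: every lane == 255
//   v_uint8x16 w = v_add_wrap(a, b);   // wrapping add:   every lane == 44 (300 mod 256)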
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);

inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
    c.val = __lsx_vilvl_h(p1, p0);
    d.val = __lsx_vilvh_h(p1, p0);
}

inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_b(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_b(a0, b0);
    c.val = __lsx_vilvl_h(p1, p0);
    d.val = __lsx_vilvh_h(p1, p0);
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_w_h(a0, b0);
    __m128i p1 = __lsx_vmulwod_w_h(a0, b0);
    c.val = __lsx_vilvl_w(p1, p0);
    d.val = __lsx_vilvh_w(p1, p0);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_w_hu(a0, b0);
    __m128i p1 = __lsx_vmulwod_w_hu(a0, b0);
    c.val = __lsx_vilvl_w(p1, p0);
    d.val = __lsx_vilvh_w(p1, p0);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_d_wu(a0, b0);
    __m128i p1 = __lsx_vmulwod_d_wu(a0, b0);
    c.val = __lsx_vilvl_d(p1, p0);
    d.val = __lsx_vilvh_d(p1, p0);
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(__lsx_vmuh_h(a.val, b.val)); }
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint16x8(__lsx_vmuh_hu(a.val, b.val)); }
#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
    { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
    { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
    { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
    { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
    template<int imm> \
    inline _Tpuvec v_shl(const _Tpuvec& a) \
    { return _Tpuvec(__lsx_vslli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shl(const _Tpsvec& a) \
    { return _Tpsvec(__lsx_vslli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpuvec v_shr(const _Tpuvec& a) \
    { return _Tpuvec(__lsx_vsrli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shr(const _Tpsvec& a) \
    { return _Tpsvec(__lsx_vsrai_##suffix(a.val, imm)); }
#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \
    OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \
    OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \
    OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \
    inline _Tpvec operator ~(const _Tpvec& a) \
    { return _Tpvec(__lsx_vnori_b(a.val, 0)); }

OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int8x16, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int16x8, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int32x4, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { \
        __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \
        a.val = cast(c); \
        return a; \
    }

#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); }

OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float64x2, _lsx_128_castsi128_pd)
#define OPENCV_HAL_IMPL_LSX_SELECT(_Tpvec) \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_vbitsel_v(b.val, a.val, mask.val)); }

inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }

inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
{ return v_float64x2(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }
#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    { return ~( a == b ); } \
    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
    { return b > a; } \
    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
    { return ~(a < b); } \
    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
    { return b >= a; }

#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
    { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \
    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
    { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \
    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
    { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \
    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
    { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \
    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \
    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)

#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \
    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    { return ~(a == b); }

OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); }

#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(<,  vfcmp_clt, _Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix)

inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }

inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }

inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }

inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }

inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(__lsx_vfcmp_cor_s(a.val, a.val)); }

inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(__lsx_vfcmp_cor_d(a.val, a.val)); }
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint8x16, __lsx_vmin_bu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint8x16, __lsx_vmax_bu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int8x16,  __lsx_vmin_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int8x16,  __lsx_vmax_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint16x8, __lsx_vmin_hu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint16x8, __lsx_vmax_hu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int16x8,  __lsx_vmin_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int16x8,  __lsx_vmax_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint32x4, __lsx_vmin_wu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint32x4, __lsx_vmax_wu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int32x4,  __lsx_vmin_w)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int32x4,  __lsx_vmax_w)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float32x4, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float32x4, __lsx_vfmax_s)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float64x2, __lsx_vfmin_d)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float64x2, __lsx_vfmax_d)
template<int imm,
         bool is_invalid = ((imm < 0) || (imm > 16)),
         bool is_first = (imm == 0),
         bool is_half = (imm == 8),
         bool is_second = (imm == 16),
         bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
class v_lsx_palignr_u8_class;

template<int imm>
class v_lsx_palignr_u8_class<imm, true, false, false, false, false>;

template<int imm>
class v_lsx_palignr_u8_class<imm, false, true, false, false, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        return a;
    }
};

template<int imm>
class v_lsx_palignr_u8_class<imm, false, false, true, false, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        return __lsx_vshuf4i_d(a, b, 0x9);
    }
};

template<int imm>
class v_lsx_palignr_u8_class<imm, false, false, false, true, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        return b;
    }
};

template<int imm>
class v_lsx_palignr_u8_class<imm, false, false, false, false, true>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        enum { imm2 = (sizeof(__m128i) - imm) };
        return __lsx_vor_v(__lsx_vbsrl_v(a, imm), __lsx_vbsll_v(b, imm2));
    }
};

template<int imm>
inline __m128i v_lsx_palignr_u8(const __m128i& a, const __m128i& b)
{
    CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_lsx_palignr_u8");
    return v_lsx_palignr_u8_class<imm>()(a, b);
}
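// v_lsx_palignr_u8<imm> extracts 16 bytes starting at byte `imm` of the concatenation
// {a, b} (with `a` in the low half); the specializations above only pick the cheapest
// instruction for imm == 0, 8 or 16. Sketch with hypothetical byte values:
//
//   // a = {0,1,...,15}, b = {16,17,...,31}
//   __m128i r = v_lsx_palignr_u8<4>(a, b);   // r = {4,5,...,19}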
#define OPENCV_HAL_IMPL_LSX_ROTATE_CAST(_Tpvec, cast) \
    template<int imm> \
    inline _Tpvec v_rotate_right(const _Tpvec& a) \
    { \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) }; \
        __m128i ret = __lsx_vbsrl_v((__m128i)a.val, imm2); \
        return _Tpvec(cast(ret)); \
    } \
    template<int imm> \
    inline _Tpvec v_rotate_left(const _Tpvec& a) \
    { \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) }; \
        __m128i ret = __lsx_vbsll_v((__m128i)a.val, imm2); \
        return _Tpvec(cast(ret)); \
    } \
    template<int imm> \
    inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
    { \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) }; \
        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)a.val, (__m128i)b.val))); \
    } \
    template<int imm> \
    inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
    { \
        enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) }; \
        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)b.val, (__m128i)a.val))); \
    }

OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int8x16,  OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint16x8, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int16x8,  OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint32x4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int32x4,  OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint64x2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int64x2,  OPENCV_HAL_NOP)

OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float32x4, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float64x2, _lsx_128_castsi128_pd)
inline v_uint8x16 v_reverse(const v_uint8x16& a)
{
    __m128i vec = __lsx_vshuf4i_b(a.val, 0x1B);
    return v_uint8x16(__lsx_vshuf4i_w(vec, 0x1B));
}

inline v_int8x16 v_reverse(const v_int8x16& a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8& a)
{
    __m128i vec = __lsx_vshuf4i_h(a.val, 0x1B);
    return v_uint16x8(__lsx_vshuf4i_w(vec, 0x4E));
}

inline v_int16x8 v_reverse(const v_int16x8& a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4& a)
{ return v_uint32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }

inline v_int32x4 v_reverse(const v_int32x4& a)
{ return v_int32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }

inline v_uint64x2 v_reverse(const v_uint64x2& a)
{ return v_uint64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }

inline v_int64x2 v_reverse(const v_int64x2& a)
{ return v_int64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }

inline v_float32x4 v_reverse(const v_float32x4& a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float64x2 v_reverse(const v_float64x2& a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    __m128i t1 = __lsx_vhaddw_hu_bu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline int v_reduce_sum(const v_int8x16& a)
{
    __m128i t1 = __lsx_vhaddw_h_b(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_w_h(t1, t1);
    __m128i t3 = __lsx_vhaddw_d_w(t2, t2);
    __m128i t4 = __lsx_vhaddw_q_d(t3, t3);
    return (int)__lsx_vpickve2gr_w(t4, 0);
}
#define OPENCV_HAL_IMPL_LSX_REDUCE_16(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \
        val = intrin(val, __lsx_vbsrl_v(val, 4)); \
        val = intrin(val, __lsx_vbsrl_v(val, 2)); \
        val = intrin(val, __lsx_vbsrl_v(val, 1)); \
        return (sctype)__lsx_vpickve2gr_b(val, 0); \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, min, __lsx_vmin_b)
OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, max, __lsx_vmax_b)

#define OPENCV_HAL_IMPL_LSX_REDUCE_8(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \
        val = intrin(val, __lsx_vbsrl_v(val, 4)); \
        val = intrin(val, __lsx_vbsrl_v(val, 2)); \
        return (sctype)__lsx_vpickve2gr_h(val, 0); \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, min, __lsx_vmin_h)
OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, max, __lsx_vmax_h)

#define OPENCV_HAL_IMPL_LSX_REDUCE_4(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \
        val = intrin(val, __lsx_vbsrl_v(val, 4)); \
        return (sctype)__lsx_vpickve2gr_w(val, 0); \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, min, __lsx_vmin_wu)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, max, __lsx_vmax_wu)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4,  int,      min, __lsx_vmin_w)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4,  int,      max, __lsx_vmax_w)

#define OPENCV_HAL_IMPL_LSX_REDUCE_FLT(func, intrin) \
    inline float v_reduce_##func(const v_float32x4& a) \
    { \
        __m128 val = a.val; \
        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 8)); \
        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 4)); \
        float* fval = (float*)&val; \
        return fval[0]; \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_FLT(min, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LSX_REDUCE_FLT(max, __lsx_vfmax_s)
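// Sketch of the horizontal reductions generated above (hypothetical values):
//
//   v_int32x4 v(1, 2, 3, 4);
//   int mn = v_reduce_min(v);   // 1
//   int mx = v_reduce_max(v);   // 4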
inline int v_reduce_sum(const v_int32x4& a)
{
    __m128i t1 = __lsx_vhaddw_d_w(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_q_d(t1, t1);
    return (int)__lsx_vpickve2gr_w(t2, 0);
}

inline unsigned v_reduce_sum(const v_uint32x4& a)
{
    __m128i t1 = __lsx_vhaddw_du_wu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (int)__lsx_vpickve2gr_w(t2, 0);
}

inline int v_reduce_sum(const v_int16x8& a)
{
    __m128i t1 = __lsx_vhaddw_w_h(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_d_w(t1, t1);
    __m128i t3 = __lsx_vhaddw_q_d(t2, t2);
    return (int)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sum(const v_uint16x8& a)
{
    __m128i t1 = __lsx_vhaddw_wu_hu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (int)__lsx_vpickve2gr_w(t3, 0);
}

inline float v_reduce_sum(const v_float32x4& a)
{
    __m128i val = (__m128i)a.val;
    val = __lsx_vbsrl_v(val, 8);
    __m128 result = __lsx_vfadd_s(a.val, (__m128)val);
    float* pa = (float*)&result;
    return (float)(pa[0] + pa[1]);
}

inline uint64 v_reduce_sum(const v_uint64x2& a)
{
    __m128i t0 = __lsx_vhaddw_qu_du(a.val, a.val);
    return (uint64)__lsx_vpickve2gr_du(t0, 0);
}

inline int64 v_reduce_sum(const v_int64x2& a)
{
    __m128i t0 = __lsx_vhaddw_q_d(a.val, a.val);
    return (int64)__lsx_vpickve2gr_d(t0, 0);
}

inline double v_reduce_sum(const v_float64x2& a)
{
    double* pa = (double*)&a;
    return pa[0] + pa[1];
}
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    __m128i a0 = (__m128i)a.val;
    __m128i b0 = (__m128i)b.val;
    __m128i c0 = (__m128i)c.val;
    __m128i d0 = (__m128i)d.val;
    __m128i ac_l = __lsx_vilvl_w(c0, a0);
    __m128i ac_h = __lsx_vilvh_w(c0, a0);
    __m128i bd_l = __lsx_vilvl_w(d0, b0);
    __m128i bd_h = __lsx_vilvh_w(d0, b0);
    __m128 ac = __lsx_vfadd_s((__m128)ac_l, (__m128)ac_h);
    __m128 bd = __lsx_vfadd_s((__m128)bd_l, (__m128)bd_h);
    return v_float32x4(__lsx_vfadd_s((__m128)__lsx_vilvl_w((__m128i)bd, (__m128i)ac),
                                     (__m128)__lsx_vilvh_w((__m128i)bd, (__m128i)ac)));
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    __m128i t0 = __lsx_vabsd_b(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i t0 = __lsx_vabsd_bu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i t0 = __lsx_vabsd_hu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    __m128i t0 = __lsx_vabsd_h(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i t0 = __lsx_vabsd_wu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
}

inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    __m128i t0 = __lsx_vabsd_w(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
}
#define OPENCV_HAL_IMPL_LSX_POPCOUNT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_popcount(const _Tp& a) \
{ return _Tpvec(__lsx_vpcnt_##suffix(a.val)); }

#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }

OPENCV_HAL_IMPL_REINTERPRET_INT(ushort,   short)
OPENCV_HAL_IMPL_REINTERPRET_INT(short,    short)
OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int,      int)
OPENCV_HAL_IMPL_REINTERPRET_INT(float,    int)
OPENCV_HAL_IMPL_REINTERPRET_INT(double,   int64)
    __m128i result = __lsx_vmskltz_b(a.val);
    return __lsx_vpickve2gr_w(result, 0);

{ return v_signmask(v_reinterpret_as_s8(a)); }

    __m128i result = __lsx_vmskltz_h(a.val);
    return __lsx_vpickve2gr_w(result, 0);

{ return v_signmask(v_reinterpret_as_s16(a)); }

    __m128i result = __lsx_vmskltz_w(a.val);
    return __lsx_vpickve2gr_w(result, 0);

{ return v_signmask(v_reinterpret_as_u32(a)); }

    __m128i result = __lsx_vmskltz_d(a.val);
    return __lsx_vpickve2gr_w(result, 0);

{ return v_signmask(v_reinterpret_as_u64(a)); }
#define OPENCV_HAL_IMPL_LSX_CHECK(_Tpvec, allmask) \
    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }

OPENCV_HAL_IMPL_LSX_CHECK(v_int8x16, 65535)
OPENCV_HAL_IMPL_LSX_CHECK(v_int16x8, 255)
#define OPENCV_HAL_IMPL_LSX_MULADD(_Tpvec, suffix) \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_sqrt(const _Tpvec& x) \
    { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_fma(a, a, b * b); } \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_sqrt(v_fma(a, a, b * b)); }

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_int32x4(__lsx_vmadd_w(c.val, a.val, b.val)); }

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_fma(a, b, c); }
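// v_fma/v_muladd compute a * b + c per lane; the float versions map to a fused
// vfmadd, the integer overload above to vmadd.w. Sketch (hypothetical values):
//
//   v_int32x4 a = v_setall_s32(2), b = v_setall_s32(3), c = v_setall_s32(1);
//   v_int32x4 r = v_muladd(a, b, c);   // every lane == 7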
#define OPENCV_HAL_IMPL_LSX_ABS(_Tpvec, suffix) \
    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
    { return v_u##_Tpvec(__lsx_vabsd_##suffix(x.val, __lsx_vldi(0))); }

OPENCV_HAL_IMPL_LSX_ABS(int8x16, b)
OPENCV_HAL_IMPL_LSX_ABS(int16x8, h)
OPENCV_HAL_IMPL_LSX_ABS(int32x4, w)

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(*((__m128i*)&x) & __lsx_vreplgr2vr_w(0x7fffffff)); }
inline v_float64x2 v_abs(const v_float64x2& x)
{ return v_float64x2(*((__m128i*)&x) & __lsx_vreplgr2vr_d(0x7fffffffffffffff)); }
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return (v_uint8x16)__lsx_vabsd_bu(a.val, b.val); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return (v_uint16x8)__lsx_vabsd_hu(a.val, b.val); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return (v_uint32x4)__lsx_vabsd_wu(a.val, b.val); }

inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{ return (v_uint8x16)__lsx_vabsd_b(a.val, b.val); }
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return (v_uint16x8)__lsx_vabsd_h(a.val, b.val); }
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(a - b); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(a - b); }

{ return v_max(a, b) - v_min(a, b); }
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(__lsx_vftint_w_s(a.val)); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(__lsx_vftint_w_d(a.val, a.val)); }

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{ return v_int32x4(__lsx_vftint_w_d(b.val, a.val)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(a.val)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(__lsx_vftintrz_w_d(a.val, a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrm_s(a.val)))); }

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrp_s(a.val)))); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(__lsx_vfcvt_s_d(a.val, a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(__lsx_vfcvt_s_d(b.val, a.val)); }
                                  tab[idx[14]], tab[idx[15]]));

    return v_int8x16(_v128_setr_h(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]),
                                  *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
                                  *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]),
                                  *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));

    return v_int8x16(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));

{ return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }

    return v_int16x8(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));

    return v_int16x8(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));

{ return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
{ return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
{ return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }

    return v_int32x4(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));

inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
{ return v_reinterpret_as_u64(v_lut((const int64_t*)tab, idx)); }

    return v_float32x4((__m128)_v128_setr_pd(*(const double*)(tab + idx[0]),
                                             *(const double*)(tab + idx[1])));

    int* idx = (int*)&idxvec.val;

    return v_reinterpret_as_u32(v_lut((const int*)tab, idxvec));

    const int* idx = (const int*)&idxvec.val;

    const int* idx = (const int*)&idxvec.val;

    const int* idx = (const int*)&idxvec.val;
    __m128i xy0 = __lsx_vld(tab + idx[0], 0);
    __m128i xy1 = __lsx_vld(tab + idx[1], 0);
    __m128i xy2 = __lsx_vld(tab + idx[2], 0);
    __m128i xy3 = __lsx_vld(tab + idx[3], 0);
    __m128i xy01 = __lsx_vilvl_d(xy1, xy0);
    __m128i xy23 = __lsx_vilvl_d(xy3, xy2);
    __m128i xxyy02 = __lsx_vilvl_w(xy23, xy01);
    __m128i xxyy13 = __lsx_vilvh_w(xy23, xy01);
    x = v_float32x4((__m128)__lsx_vilvl_w(xxyy13, xxyy02));
    y = v_float32x4((__m128)__lsx_vilvh_w(xxyy13, xxyy02));

    const int* idx = (const int*)&idxvec.val;
    __m128i xy0 = __lsx_vld(tab + idx[0], 0);
    __m128i xy1 = __lsx_vld(tab + idx[1], 0);
    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
                     _v128_setr_d(0x0705060403010200, 0x0f0d0e0c0b090a08)));

    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
                     _v128_setr_d(0x0703060205010400, 0x0f0b0e0a0d090c08)));

    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
                     _v128_setr_d(0x0706030205040100, 0x0f0e0b0a0d0c0908)));

    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
                     _v128_setr_d(0x0b0a030209080100, 0x0f0e07060d0c0504)));

    return v_int32x4(__lsx_vshuf4i_w(vec.val, 0xd8));

    __m128i zero = __lsx_vldi(0);
    return v_int8x16(__lsx_vshuf_b(zero, vec.val,
                     _v128_set_d(0x1211100f0e0d0c0a, 0x0908060504020100)));

{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

    __m128i zero = __lsx_vldi(0);
    return v_int16x8(__lsx_vshuf_b(zero, vec.val,
                     _v128_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100)));

{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    __m128i x = a.val, y = b.val;
    return v_int32x4(__lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y));
}

    __m128i x = a.val, y = b.val, z = c.val;
    __m128i t = __lsx_vmaddwev_w_h(z, x, y);

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    __m128i x = a.val, y = b.val;
    return v_int64x2(__lsx_vmaddwod_d_w(__lsx_vmulwev_d_w(x, y), x, y));
}

    __m128i x = a.val, y = b.val, z = c.val;
    __m128i t = __lsx_vmaddwev_d_w(z, x, y);

inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_h_bu(x, y);
    __m128i odd  = __lsx_vmulwod_h_bu(x, y);
    __m128i prod0 = __lsx_vhaddw_wu_hu(even, even);
    __m128i prod1 = __lsx_vhaddw_wu_hu(odd, odd);
    return v_uint32x4(__lsx_vadd_w(prod0, prod1));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_h_b(x, y);
    __m128i odd  = __lsx_vmulwod_h_b(x, y);
    __m128i prod0 = __lsx_vhaddw_w_h(even, even);
    __m128i prod1 = __lsx_vhaddw_w_h(odd, odd);
    return v_int32x4(__lsx_vadd_w(prod0, prod1));
}

inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_w_hu(x, y);
    __m128i odd  = __lsx_vmulwod_w_hu(x, y);
    __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
    __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
    return v_uint64x2(__lsx_vadd_d(prod0, prod1));
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_w_h(x, y);
    __m128i odd  = __lsx_vmulwod_w_h(x, y);
    __m128i prod0 = __lsx_vhaddw_d_w(even, even);
    __m128i prod1 = __lsx_vhaddw_d_w(odd, odd);
    return v_int64x2(__lsx_vadd_d(prod0, prod1));
}
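// The widening dot products above multiply adjacent lane pairs and sum them, doubling
// the lane width (e.g. 8 x s16 -> 4 x s32). Sketch (hypothetical values; assumes
// v_setall_s16 from the elided v_int16x8 init macro):
//
//   v_int16x8 a = v_setall_s16(2), b = v_setall_s16(3);
//   v_int32x4 d = v_dotprod(a, b);   // every lane == 2*3 + 2*3 == 12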
    __m128i x = a.val, y = b.val;
    __m128i even = __lsx_vmulwev_w_hu(x, y);
    __m128i odd  = __lsx_vmulwod_w_hu(x, y);
    __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
    __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
    return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));

    __m128i x = a.val, y = b.val;
    __m128i prod = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y);
    __m128i sign = __lsx_vsrai_w(prod, 31);
    __m128i lo = __lsx_vilvl_w(sign, prod);
    __m128i hi = __lsx_vilvh_w(sign, prod);
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128i x = (__m128i)v.val;
    __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0),  m0.val);
    __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
    __m128 v2 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val);
    __m128 v3 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xFF), m3.val);

    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), __lsx_vfadd_s(v2, v3)));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128i x = (__m128i)v.val;
    __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0),  m0.val);
    __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
    __m128 v2 = __lsx_vfmadd_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val, a.val);

    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), v2));
}
#define OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(_Tpvec, cast_from, cast_to) \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                               const _Tpvec& a2, const _Tpvec& a3, \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
    { \
        __m128i t0 = cast_from(__lsx_vilvl_w(a1.val, a0.val)); \
        __m128i t1 = cast_from(__lsx_vilvl_w(a3.val, a2.val)); \
        __m128i t2 = cast_from(__lsx_vilvh_w(a1.val, a0.val)); \
        __m128i t3 = cast_from(__lsx_vilvh_w(a3.val, a2.val)); \
        b0.val = cast_to(__lsx_vilvl_d(t1, t0)); \
        b1.val = cast_to(__lsx_vilvh_d(t1, t0)); \
        b2.val = cast_to(__lsx_vilvl_d(t3, t2)); \
        b3.val = cast_to(__lsx_vilvh_d(t3, t2)); \
    }

OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_uint32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_int32x4,  OPENCV_HAL_NOP, OPENCV_HAL_NOP)
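// v_transpose4x4 treats its four inputs as the rows of a 4x4 matrix and writes the
// transposed rows to b0..b3. Sketch (hypothetical values):
//
//   v_int32x4 r0(0, 1, 2, 3), r1(4, 5, 6, 7), r2(8, 9, 10, 11), r3(12, 13, 14, 15);
//   v_int32x4 c0, c1, c2, c3;
//   v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);   // c0 == {0, 4, 8, 12}, ...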
    __m128i vec0 = (__m128i)a0.val, vec1 = (__m128i)a1.val;
    __m128i vec2 = (__m128i)a2.val, vec3 = (__m128i)a3.val;
    __m128i t0 = __lsx_vilvl_w(vec1, vec0);
    __m128i t1 = __lsx_vilvl_w(vec3, vec2);
    __m128i t2 = __lsx_vilvh_w(vec1, vec0);
    __m128i t3 = __lsx_vilvh_w(vec3, vec2);
    b0.val = __m128(__lsx_vilvl_d(t1, t0));
    b1.val = __m128(__lsx_vilvh_d(t1, t0));
    b2.val = __m128(__lsx_vilvl_d(t3, t2));
    b3.val = __m128(__lsx_vilvh_d(t3, t2));
#define OPENCV_HAL_IMPL_LSX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin_lo, intrin_hi) \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    { \
        b0.val = intrin_lo(a.val, 0); \
        b1.val = intrin_hi(a.val); \
    } \
    inline _Tpwvec v_expand_low(const _Tpvec& a) \
    { return _Tpwvec(intrin_lo(a.val, 0)); } \
    inline _Tpwvec v_expand_high(const _Tpvec& a) \
    { return _Tpwvec(intrin_hi(a.val)); } \
    inline _Tpwvec v_load_expand(const _Tp* ptr) \
    { \
        __m128i a = __lsx_vld(ptr, 0); \
        return _Tpwvec(intrin_lo(a, 0)); \
    }

OPENCV_HAL_IMPL_LSX_EXPAND(v_int16x8,  v_int32x4,  short,    __lsx_vsllwil_w_h,   __lsx_vexth_w_h)
OPENCV_HAL_IMPL_LSX_EXPAND(v_uint32x4, v_uint64x2, unsigned, __lsx_vsllwil_du_wu, __lsx_vexth_du_wu)
OPENCV_HAL_IMPL_LSX_EXPAND(v_int32x4,  v_int64x2,  int,      __lsx_vsllwil_d_w,   __lsx_vexth_d_w)

#define OPENCV_HAL_IMPL_LSX_EXPAND_Q(_Tpvec, _Tp, intrin_lo, intrin_hi) \
    inline _Tpvec v_load_expand_q(const _Tp* ptr) \
    { \
        __m128i a = __lsx_vld(ptr, 0); \
        __m128i b = intrin_lo(a, 0); \
        return _Tpvec(intrin_hi(b, 0)); \
    }

OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_uint32x4, uchar, __lsx_vsllwil_hu_bu, __lsx_vsllwil_wu_hu)
OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_int32x4,  schar, __lsx_vsllwil_h_b,   __lsx_vsllwil_w_h)
{ return v_int8x16(_lsx_packs_h(a.val, b.val)); }

{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, 0)); }

{ return v_uint8x16(_lsx_packus_h(a.val, b.val)); }

template<int n> inline
{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, n)); }

template<int n> inline
{ __lsx_vstelm_d(__lsx_vssrlrni_bu_h(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
{ return v_uint8x16(__lsx_vssrarni_bu_h(b.val, a.val, n)); }

template<int n> inline
{ __lsx_vstelm_d(__lsx_vssrarni_bu_h(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
{ return v_int8x16(__lsx_vssrarni_b_h(b.val, a.val, n)); }

template<int n> inline
{ __lsx_vstelm_d(__lsx_vssrarni_b_h(a.val, a.val, n), ptr, 0, 0); }

{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, 0)); }

{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, 0)); }

{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, 0)); }

{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, 0), ptr, 0, 0); }

{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
template<int n> inline
{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, n)); }

template<int n> inline
{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, n)); }

template<int n> inline
{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{ __lsx_vstelm_d(__lsx_vssrarni_h_w(a.val, a.val, n), ptr, 0, 0); }
{ return v_uint32x4(__lsx_vpickev_w(b.val, a.val)); }

{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

{ __lsx_vstelm_d(__lsx_vshuf4i_w(a.val, 0x08), ptr, 0, 0); }

{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(a)); }

template<int n> inline
{ return v_uint32x4(__lsx_vsrlrni_w_d(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{ __lsx_vstelm_d(__lsx_vsrlrni_w_d(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
{ return v_int32x4(__lsx_vsrarni_w_d(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{ __lsx_vstelm_d(__lsx_vsrarni_w_d(a.val, a.val, n), ptr, 0, 0); }

{ return v_uint8x16(__lsx_vssrarni_b_h(b.val, a.val, 0)); }

    __m128i ab = __lsx_vssrarni_h_w(b.val, a.val, 0);
    __m128i cd = __lsx_vssrarni_h_w(d.val, c.val, 0);
    return v_uint8x16(__lsx_vssrarni_b_h(cd, ab, 0));

    __m128i ab = __lsx_vssrarni_w_d(b.val, a.val, 0);
    __m128i cd = __lsx_vssrarni_w_d(d.val, c.val, 0);
    __m128i ef = __lsx_vssrarni_w_d(f.val, e.val, 0);
    __m128i gh = __lsx_vssrarni_w_d(h.val, g.val, 0);

    __m128i abcd = __lsx_vssrarni_h_w(cd, ab, 0);
    __m128i efgh = __lsx_vssrarni_h_w(gh, ef, 0);
    return v_uint8x16(__lsx_vssrarni_b_h(efgh, abcd, 0));
#define OPENCV_HAL_IMPL_LSX_EXTRACT(_Tpvec) \
    template<int s> \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
    { return v_rotate_right<s>(a, b); }

#define OPENCV_HAL_IMPL_LSX_EXTRACT_N(_Tpvec, _Twvec, intrin) \
template<int i> \
inline _Twvec v_extract_n(const _Tpvec& a) \
{ return (_Twvec)intrin(a.val, i); }

OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int8x16,  schar, __lsx_vpickve2gr_b)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int16x8,  short, __lsx_vpickve2gr_h)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint32x4, uint,  __lsx_vpickve2gr_w)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int32x4,  int,   __lsx_vpickve2gr_w)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int64x2,  int64, __lsx_vpickve2gr_d)

    union { uint iv; float fv; } d;
    d.iv = __lsx_vpickve2gr_w(v.val, i);

    union { uint64 iv; double dv; } d;
    d.iv = __lsx_vpickve2gr_d(v.val, i);

template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{ return v_uint32x4(__lsx_vreplvei_w(a.val, i)); }

template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{ return v_int32x4(__lsx_vreplvei_w(a.val, i)); }

template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{ return v_float32x4((__m128)__lsx_vreplvei_w((__m128i)a.val, i)); }
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);

    a.val = __lsx_vpickev_b(t1, t0);
    b.val = __lsx_vpickod_b(t1, t0);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vpickev_h(t1, t0);
    b.val = __lsx_vpickod_h(t1, t0);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vpickev_w(t1, t0);
    b.val = __lsx_vpickod_w(t1, t0);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vilvl_d(t1, t0);
    b.val = __lsx_vilvh_d(t1, t0);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    const __m128i shuff0 = _v128_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i shuff1 = _v128_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff0);
    __m128i b0 = __lsx_vbitsel_v(t1, t0, shuff1);
    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
    const __m128i shuff_a = _v128_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29);
    const __m128i shuff_b = _v128_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30);
    const __m128i shuff_c = _v128_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31);

    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
    c.val = __lsx_vshuf_b(t2, c0, shuff_c);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    const __m128i shuff0 = _v128_setr_h(0, 0, -1, 0, 0, -1, 0, 0);
    const __m128i shuff1 = _v128_setr_h(0, -1, 0, 0, -1, 0, 0, -1);

    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff1);
    __m128i b0 = __lsx_vbitsel_v(t0, t1, shuff0);
    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);

    const __m128i shuff_a = _v128_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 20, 21, 26, 27);
    const __m128i shuff_b = _v128_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 16, 17, 22, 23, 28, 29);
    const __m128i shuff_c = _v128_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31);

    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
    c.val = __lsx_vshuf_b(t2, c0, shuff_c);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);

    __m128i a0 = __lsx_vpermi_w(t1, t0, 0xAC);
    __m128i b0 = __lsx_vpermi_w(t1, t0, 0xC5);
    __m128i c0 = __lsx_vpermi_w(t1, t0, 0x5A);

    a.val = __lsx_vextrins_w(a0, t2, 0x31);
    b0 = __lsx_vshuf4i_w(b0, 0x38);
    c0 = __lsx_vshuf4i_w(c0, 0x8);
    b.val = __lsx_vextrins_w(b0, t2, 0x32);
    c.val = __lsx_vpermi_w(t2, c0, 0xC4);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);

    a.val = __lsx_vshuf4i_d(t0, t1, 0xC);
    b.val = __lsx_vshuf4i_d(t0, t2, 0x9);
    c.val = __lsx_vshuf4i_d(t1, t2, 0xC);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    __m128i t3 = __lsx_vld(ptr, 48);

    __m128i ac_lo = __lsx_vpickev_b(t1, t0);
    __m128i bd_lo = __lsx_vpickod_b(t1, t0);
    __m128i ac_hi = __lsx_vpickev_b(t3, t2);
    __m128i bd_hi = __lsx_vpickod_b(t3, t2);

    a.val = __lsx_vpickev_b(ac_hi, ac_lo);
    c.val = __lsx_vpickod_b(ac_hi, ac_lo);
    b.val = __lsx_vpickev_b(bd_hi, bd_lo);
    d.val = __lsx_vpickod_b(bd_hi, bd_lo);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    __m128i t3 = __lsx_vld(ptr, 48);

    __m128i ac_lo = __lsx_vpickev_h(t1, t0);
    __m128i bd_lo = __lsx_vpickod_h(t1, t0);
    __m128i ac_hi = __lsx_vpickev_h(t3, t2);
    __m128i bd_hi = __lsx_vpickod_h(t3, t2);

    a.val = __lsx_vpickev_h(ac_hi, ac_lo);
    c.val = __lsx_vpickod_h(ac_hi, ac_lo);
    b.val = __lsx_vpickev_h(bd_hi, bd_lo);
    d.val = __lsx_vpickod_h(bd_hi, bd_lo);

    __m128i p0 = __lsx_vld(ptr, 0);
    __m128i p1 = __lsx_vld(ptr, 16);
    __m128i p2 = __lsx_vld(ptr, 32);
    __m128i p3 = __lsx_vld(ptr, 48);

    __m128i t0 = __lsx_vilvl_w(p1, p0);
    __m128i t1 = __lsx_vilvl_w(p3, p2);
    __m128i t2 = __lsx_vilvh_w(p1, p0);
    __m128i t3 = __lsx_vilvh_w(p3, p2);
    a.val = __lsx_vilvl_d(t1, t0);
    b.val = __lsx_vilvh_d(t1, t0);
    c.val = __lsx_vilvl_d(t3, t2);
    d.val = __lsx_vilvh_d(t3, t2);

    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    __m128i t3 = __lsx_vld(ptr, 48);

    a.val = __lsx_vilvl_d(t2, t0);
    b.val = __lsx_vilvh_d(t2, t0);
    c.val = __lsx_vilvl_d(t3, t1);
    d.val = __lsx_vilvh_d(t3, t1);
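// v_load_deinterleave splits channel-interleaved memory into one register per channel;
// v_store_interleave (below) is the inverse. Sketch for two interleaved 32-bit channels
// (hypothetical data; assumes the int overloads declared around the elided signatures):
//
//   int xy[8] = { 0, 10, 1, 11, 2, 12, 3, 13 };
//   v_int32x4 x, y;
//   v_load_deinterleave(xy, x, y);   // x == {0,1,2,3}, y == {10,11,12,13}
//   v_store_interleave(xy, x, y);    // writes the interleaved layout back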
2251 __m128i v0 = __lsx_vilvl_b(b.val, a.val);
2252 __m128i v1 = __lsx_vilvh_b(b.val, a.val);
2254 __lsx_vst(v0, ptr, 0);
2255 __lsx_vst(v1, ptr, 16);
2261 __m128i v0 = __lsx_vilvl_h(b.val, a.val);
2262 __m128i v1 = __lsx_vilvh_h(b.val, a.val);
2264 __lsx_vst(v0, ptr, 0);
2265 __lsx_vst(v1, ptr, 16);
2271 __m128i v0 = __lsx_vilvl_w(b.val, a.val);
2272 __m128i v1 = __lsx_vilvh_w(b.val, a.val);
2274 __lsx_vst(v0, ptr, 0);
2275 __lsx_vst(v1, ptr, 16);
2281 __m128i v0 = __lsx_vilvl_d(b.val, a.val);
2282 __m128i v1 = __lsx_vilvh_d(b.val, a.val);
2284 __lsx_vst(v0, ptr, 0);
2285 __lsx_vst(v1, ptr, 16);
2291 __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
2292 __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
2293 __m128i v_c = c.val;
2294 const __m128i shuff0 = _v128_setr_b(0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10);
2295 const __m128i shuff1 = _v128_setr_b(11, 21, 12, 13, 22, 14, 15, 23, 0, 0, 0, 0, 0, 0, 0, 0);
2296 const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 24, 18, 19, 25, 20, 21);
2297 const __m128i shuff3 = _v128_setr_b(26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31);
2298 __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
2300 __m128i
dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
2301 __m128i
dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
2302 __m128i
dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
2303 dst1 = __lsx_vshuf_b(abc,
dst1, shuff2);
2305 __lsx_vst(
dst0, ptr, 0);
2306 __lsx_vst(
dst1, ptr, 16);
2307 __lsx_vst(
dst2, ptr, 32);
2313 __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
2314 __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
2315 __m128i v_c = c.val;
2316 const __m128i shuff0 = _v128_setr_b(0, 1, 2, 3, 16, 17, 4, 5, 6, 7, 18, 19, 8, 9, 10, 11);
2317 const __m128i shuff1 = _v128_setr_b(20, 21, 12, 13, 14, 15, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0);
2318 const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 24, 25, 20, 21);
2319 const __m128i shuff3 = _v128_setr_b(6, 7, 26, 27, 8, 9, 10, 11, 28, 29, 12, 13, 14, 15, 30, 31);
2320 __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
2322 __m128i
dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
2323 __m128i
dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
2324 __m128i
dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
2325 dst1 = __lsx_vshuf_b(abc,
dst1, shuff2);
2327 __lsx_vst(
dst0, ptr, 0);
2328 __lsx_vst(
dst1, ptr, 16);
2329 __lsx_vst(
dst2, ptr, 32);
2335 __m128i v_c = c.val;
2336 __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
2337 __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
2338 __m128i bc_od = __lsx_vpackod_w(v_c, b.val);
2340 __m128i
dst0 = __lsx_vshuf4i_w(ab_lo, 0xB4);
2341 __m128i
dst1 = __lsx_vilvl_d(ab_hi, bc_od);
2342 __m128i
dst2 = __lsx_vpermi_w(bc_od, ab_hi, 0xE8);
2344 dst0 = __lsx_vextrins_w(
dst0, v_c, 0x20);
2345 dst2 = __lsx_vextrins_w(
dst2, v_c, 0x2);
2346 __lsx_vst(
dst0, ptr, 0);
2347 __lsx_vst(
dst1, ptr, 16);
2348 __lsx_vst(
dst2, ptr, 32);
2354 __m128i
dst0 = __lsx_vilvl_d(b.val, a.val);
2355 __m128i
dst1 = __lsx_vpermi_w(a.val, c.val, 0xE4);
2356 __m128i
dst2 = __lsx_vilvh_d(c.val, b.val);
2358 __lsx_vst(
dst0, ptr, 0);
2359 __lsx_vst(
dst1, ptr, 16);
2360 __lsx_vst(
dst2, ptr, 32);
2367 __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
2368 __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
2369 __m128i cd_lo = __lsx_vilvl_b(d.val, c.val);
2370 __m128i cd_hi = __lsx_vilvh_b(d.val, c.val);
2372 __m128i
dst0 = __lsx_vilvl_h(cd_lo, ab_lo);
2373 __m128i
dst1 = __lsx_vilvh_h(cd_lo, ab_lo);
2374 __m128i
dst2 = __lsx_vilvl_h(cd_hi, ab_hi);
2375 __m128i
dst3 = __lsx_vilvh_h(cd_hi, ab_hi);
2377 __lsx_vst(
dst0, ptr, 0);
2378 __lsx_vst(
dst1, ptr, 16);
2379 __lsx_vst(
dst2, ptr, 32);
2380 __lsx_vst(
dst3, ptr, 48);
2387 __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
2388 __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
2389 __m128i cd_lo = __lsx_vilvl_h(d.val, c.val);
2390 __m128i cd_hi = __lsx_vilvh_h(d.val, c.val);
2392 __m128i
dst0 = __lsx_vilvl_w(cd_lo, ab_lo);
2393 __m128i
dst1 = __lsx_vilvh_w(cd_lo, ab_lo);
2394 __m128i
dst2 = __lsx_vilvl_w(cd_hi, ab_hi);
2395 __m128i
dst3 = __lsx_vilvh_w(cd_hi, ab_hi);
2397 __lsx_vst(
dst0, ptr, 0);
2398 __lsx_vst(
dst1, ptr, 16);
2399 __lsx_vst(
dst2, ptr, 32);
2400 __lsx_vst(
dst3, ptr, 48);
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c, const v_uint32x4& d,
                                hal::StoreMode = hal::STORE_UNALIGNED )
{
    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
    __m128i cd_lo = __lsx_vilvl_w(d.val, c.val);
    __m128i cd_hi = __lsx_vilvh_w(d.val, c.val);

    __m128i dst0 = __lsx_vilvl_d(cd_lo, ab_lo);
    __m128i dst1 = __lsx_vilvh_d(cd_lo, ab_lo);
    __m128i dst2 = __lsx_vilvl_d(cd_hi, ab_hi);
    __m128i dst3 = __lsx_vilvh_d(cd_hi, ab_hi);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
    __lsx_vst(dst3, ptr, 48);
}
inline void v_store_interleave( uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                                const v_uint64x2& c, const v_uint64x2& d,
                                hal::StoreMode = hal::STORE_UNALIGNED )
{
    __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
    __m128i dst2 = __lsx_vilvh_d(b.val, a.val);
    __m128i dst1 = __lsx_vilvl_d(d.val, c.val);
    __m128i dst3 = __lsx_vilvh_d(d.val, c.val);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
    __lsx_vst(dst3, ptr, 48);
}
#define OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, \
                                _Tpvec0& c0, _Tpvec0& d0) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                               hal::StoreMode = hal::STORE_UNALIGNED) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1); \
} \
inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0,\
                               hal::StoreMode = hal::STORE_UNALIGNED) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                               const _Tpvec0& c0, const _Tpvec0& d0, \
                               hal::StoreMode = hal::STORE_UNALIGNED) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
}
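// The macro above is intended to be instantiated once per signed/float lane type,
// forwarding it to the unsigned implementation of the same element width via
// v_reinterpret_as_*. Illustrative instantiation (assumed form, shown only as an
// example of the expected arguments):
//   OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)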
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
#if CV_FP16
    return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0)));
#else
    float CV_DECL_ALIGNED(32) buf[4];
    for (int i = 0; i < 4; i++)
        buf[i] = (float)ptr[i];
    return v_float32x4((__m128)__lsx_vld(buf, 0));
#endif
}

inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
{
#if CV_FP16
    __m128i res = (__m128i)__lsx_vfcvt_h_s(a.val, a.val);
    __lsx_vstelm_d(res, ptr, 0, 0);
#else
    float CV_DECL_ALIGNED(32) buf[4];
    v_store_aligned(buf, a);
    for (int i = 0; i < 4; i++)
        ptr[i] = hfloat(buf[i]);
#endif
}
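// Usage sketch (illustrative; `convert_fp16_row` is a hypothetical helper, not part
// of this header): widening a row of hfloat values to float and packing it back,
// showing how the two conversions above pair up. Assumes `n` is a multiple of 4
// (the v_float32x4 lane count).
static inline void convert_fp16_row(const hfloat* src, hfloat* dst, int n)
{
    for (int i = 0; i < n; i += 4)
    {
        v_float32x4 v = v_load_expand(src + i);  // 4 x fp16 -> 4 x fp32
        v_pack_store(dst + i, v);                // 4 x fp32 -> 4 x fp16
    }
}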
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END