#ifndef OPENCV_HAL_INTRIN_LASX_HPP
#define OPENCV_HAL_INTRIN_LASX_HPP

#include <lasxintrin.h>

#define CV_SIMD256_64F 1
#define CV_SIMD256_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
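// Low-level helpers: build 256-bit LASX values lane by lane (setr/set/setall)
// and wrap the saturating pack intrinsics used by the universal intrinsics below.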
inline __m256i _v256_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9,
                            char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19,
                            char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29,
                            char v30, char v31)
{
    return (__m256i)v32i8{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
                           v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
                           v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
                           v30, v31 };
}
inline __m256i _v256_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9,
                           char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19,
                           char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29,
                           char v30, char v31)
{
    return (__m256i)v32i8{ v31, v30,
                           v29, v28, v27, v26, v25, v24, v23, v22, v21, v20,
                           v19, v18, v17, v16, v15, v14, v13, v12, v11, v10,
                           v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 };
}
inline __m256i _v256_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
                            short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15)
{
    return (__m256i)v16i16{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 };
}
inline __m256i _v256_setr_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
{
    return (__m256i)v8i32{ v0, v1, v2, v3, v4, v5, v6, v7 };
}
inline __m256i _v256_set_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
{
    return (__m256i)v8i32{ v7, v6, v5, v4, v3, v2, v1, v0 };
}
inline __m256i _v256_setall_w(int v0)
{
    return (__m256i)v8i32{ v0, v0, v0, v0, v0, v0, v0, v0 };
}

inline __m256i _v256_setr_d(int64 v0, int64 v1, int64 v2, int64 v3)
{
    return (__m256i)v4i64{ v0, v1, v2, v3 };
}

inline __m256i _v256_set_d(int64 v0, int64 v1, int64 v2, int64 v3)
{
    return (__m256i)v4i64{ v3, v2, v1, v0 };
}
inline __m256 _v256_setr_ps(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7)
{
    return (__m256)v8f32{ v0, v1, v2, v3, v4, v5, v6, v7 };
}

inline __m256 _v256_setall_ps(float f32)
{
    return (__m256)v8f32{ f32, f32, f32, f32, f32, f32, f32, f32 };
}

inline __m256d _v256_setr_pd(double v0, double v1, double v2, double v3)
{
    return (__m256d)v4f64{ v0, v1, v2, v3 };
}

inline __m256d _v256_setall_pd(double f64)
{
    return (__m256d)v4f64{ f64, f64, f64, f64 };
}
inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrarni_bu_h(b, a, 0);
}

inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrarni_b_h(b, a, 0);
}

inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrarni_hu_w(b, a, 0);
}

inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrarni_h_w(b, a, 0);
}
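// Helpers for moving 128-bit halves: combine two LSX registers into one LASX
// register, permute 128-bit lanes, and extract the low/high half.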
inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
{ return __lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02); }

inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
{ return __m256(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }

inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }

inline __m256i _v256_shuffle_odd_64(const __m256i& v)
{ return __lasx_xvpermi_d(v, 0xd8); }

inline __m256d _v256_shuffle_odd_64(const __m256d& v)
{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&v), 0xd8)); }
template<int imm>
inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
{ return __lasx_xvpermi_q(a, b, imm); }

template<int imm>
inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
{ return __m256(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }

template<int imm>
inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }

template<int imm, typename _Tpvec>
inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }

template<int imm>
inline __m256i _v256_permute4x64(const __m256i& a)
{ return __lasx_xvpermi_d(a, imm); }

template<int imm>
inline __m256d _v256_permute4x64(const __m256d& a)
{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&a), imm)); }

template<int imm, typename _Tpvec>
inline _Tpvec v256_permute4x64(const _Tpvec& a)
{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
inline __m128i _v256_extract_high(const __m256i& v)
{ __m256i temp256i = __lasx_xvpermi_d(v, 0x4E);
  return *((__m128i*)&temp256i); }

inline __m128 _v256_extract_high(const __m256& v)
{ return __m128(_v256_extract_high(*((__m256i*)&v))); }

inline __m128d _v256_extract_high(const __m256d& v)
{ return __m128d(_v256_extract_high(*((__m256i*)&v))); }

inline __m128i _v256_extract_low(const __m256i& v)
{ return *((__m128i*)&v); }

inline __m128 _v256_extract_low(const __m256& v)
{ return __m128(_v256_extract_low(*((__m256i*)&v))); }

inline __m128d _v256_extract_low(const __m256d& v)
{ return __m128d(_v256_extract_low(*((__m256i*)&v))); }

inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrlrni_hu_w(b, a, 0);
}
template<int i>
inline int _v256_extract_b(const __m256i& a)
{
    int des[1] = {0};
    __lasx_xvstelm_b(a, des, 0, i);
    return des[0];
}

template<int i>
inline int _v256_extract_h(const __m256i& a)
{
    int des[1] = {0};
    __lasx_xvstelm_h(a, des, 0, i);
    return des[0];
}

template<int i>
inline int _v256_extract_w(const __m256i& a)
{
    return __lasx_xvpickve2gr_w(a, i);
}

template<int i>
inline int64 _v256_extract_d(const __m256i& a)
{
    return __lasx_xvpickve2gr_d(a, i);
}
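// Universal intrinsic vector types: each struct wraps one 256-bit register
// and records its lane type and lane count (nlanes).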
struct v_uint8x32
{
    typedef uchar lane_type;
    enum { nlanes = 32 };
    __m256i val;

    explicit v_uint8x32(__m256i v) : val(v) {}
    v_uint8x32(uchar v0,  uchar v1,  uchar v2,  uchar v3,
               uchar v4,  uchar v5,  uchar v6,  uchar v7,
               uchar v8,  uchar v9,  uchar v10, uchar v11,
               uchar v12, uchar v13, uchar v14, uchar v15,
               uchar v16, uchar v17, uchar v18, uchar v19,
               uchar v20, uchar v21, uchar v22, uchar v23,
               uchar v24, uchar v25, uchar v26, uchar v27,
               uchar v28, uchar v29, uchar v30, uchar v31)
    {
        val = _v256_setr_b((char)v0,  (char)v1,  (char)v2,  (char)v3,
                           (char)v4,  (char)v5,  (char)v6,  (char)v7,  (char)v8,  (char)v9,
                           (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
                           (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
                           (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
                           (char)v28, (char)v29, (char)v30, (char)v31);
    }

    v_uint8x32() {}

    uchar get0() const
    {
        uchar des[32] = {0};
        __lasx_xvstelm_b(val, des, 0, 0);
        return des[0];
    }
};
struct v_int8x32
{
    typedef schar lane_type;
    enum { nlanes = 32 };
    __m256i val;

    explicit v_int8x32(__m256i v) : val(v) {}
    v_int8x32(schar v0,  schar v1,  schar v2,  schar v3,
              schar v4,  schar v5,  schar v6,  schar v7,
              schar v8,  schar v9,  schar v10, schar v11,
              schar v12, schar v13, schar v14, schar v15,
              schar v16, schar v17, schar v18, schar v19,
              schar v20, schar v21, schar v22, schar v23,
              schar v24, schar v25, schar v26, schar v27,
              schar v28, schar v29, schar v30, schar v31)
    {
        val = _v256_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
                           v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
                           v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
    }

    v_int8x32() {}

    schar get0() const
    {
        schar des[32] = {0};
        __lasx_xvstelm_b(val, des, 0, 0);
        return des[0];
    }
};
struct v_uint16x16
{
    typedef ushort lane_type;
    enum { nlanes = 16 };
    __m256i val;

    explicit v_uint16x16(__m256i v) : val(v) {}
    v_uint16x16(ushort v0,  ushort v1,  ushort v2,  ushort v3,
                ushort v4,  ushort v5,  ushort v6,  ushort v7,
                ushort v8,  ushort v9,  ushort v10, ushort v11,
                ushort v12, ushort v13, ushort v14, ushort v15)
    {
        val = _v256_setr_h((short)v0,  (short)v1,  (short)v2,  (short)v3,
                           (short)v4,  (short)v5,  (short)v6,  (short)v7,  (short)v8,  (short)v9,
                           (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
    }

    v_uint16x16() {}

    ushort get0() const
    {
        ushort des[16] = {0};
        __lasx_xvstelm_h(val, des, 0, 0);
        return des[0];
    }
};
struct v_int16x16
{
    typedef short lane_type;
    enum { nlanes = 16 };
    __m256i val;

    explicit v_int16x16(__m256i v) : val(v) {}
    v_int16x16(short v0,  short v1,  short v2,  short v3,
               short v4,  short v5,  short v6,  short v7,
               short v8,  short v9,  short v10, short v11,
               short v12, short v13, short v14, short v15)
    {
        val = _v256_setr_h(v0, v1, v2, v3, v4, v5, v6, v7,
                           v8, v9, v10, v11, v12, v13, v14, v15);
    }

    v_int16x16() {}

    short get0() const
    {
        short des[16] = {0};
        __lasx_xvstelm_h(val, des, 0, 0);
        return des[0];
    }
};
struct v_uint32x8
{
    typedef unsigned lane_type;
    enum { nlanes = 8 };
    __m256i val;

    explicit v_uint32x8(__m256i v) : val(v) {}
    v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
               unsigned v4, unsigned v5, unsigned v6, unsigned v7)
    {
        val = _v256_setr_w((unsigned)v0, (unsigned)v1, (unsigned)v2,
                           (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
    }

    v_uint32x8() {}

    unsigned get0() const { return __lasx_xvpickve2gr_wu(val, 0); }
};
struct v_int32x8
{
    typedef int lane_type;
    enum { nlanes = 8 };
    __m256i val;

    explicit v_int32x8(__m256i v) : val(v) {}
    v_int32x8(int v0, int v1, int v2, int v3,
              int v4, int v5, int v6, int v7)
    {
        val = _v256_setr_w(v0, v1, v2, v3, v4, v5, v6, v7);
    }

    v_int32x8() {}

    int get0() const { return __lasx_xvpickve2gr_w(val, 0); }
};
struct v_float32x8
{
    typedef float lane_type;
    enum { nlanes = 8 };
    __m256 val;

    explicit v_float32x8(__m256 v) : val(v) {}
    explicit v_float32x8(__m256i v) { val = *((__m256*)&v); }
    v_float32x8(float v0, float v1, float v2, float v3,
                float v4, float v5, float v6, float v7)
    {
        val = _v256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
    }

    v_float32x8() {}

    float get0() const
    {
        float des[8] = {0};
        __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0);
        return des[0];
    }

    int get0toint() const
    {
        int des[8] = {0};
        __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0);
        return des[0];
    }
};
struct v_uint64x4
{
    typedef uint64 lane_type;
    enum { nlanes = 4 };
    __m256i val;

    explicit v_uint64x4(__m256i v) : val(v) {}
    v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
    { val = _v256_setr_d((int64)v0, (int64)v1, (int64)v2, (int64)v3); }

    v_uint64x4() {}

    uint64 get0() const
    {
        return __lasx_xvpickve2gr_du(val, 0);
    }
};
struct v_int64x4
{
    typedef int64 lane_type;
    enum { nlanes = 4 };
    __m256i val;

    explicit v_int64x4(__m256i v) : val(v) {}
    v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
    { val = _v256_setr_d(v0, v1, v2, v3); }

    v_int64x4() {}

    int64 get0() const
    {
        return __lasx_xvpickve2gr_d(val, 0);
    }
};
struct v_float64x4
{
    typedef double lane_type;
    enum { nlanes = 4 };
    __m256d val;

    explicit v_float64x4(__m256d v) : val(v) {}
    explicit v_float64x4(__m256i v) { val = *((__m256d*)&v); }
    v_float64x4(double v0, double v1, double v2, double v3)
    { val = _v256_setr_pd(v0, v1, v2, v3); }

    v_float64x4() {}

    double get0() const
    {
        double des[4] = {0};
        __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0);
        return des[0];
    }

    int64 get0toint64() const
    {
        int64 des[4] = {0};
        __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0);
        return des[0];
    }
};
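// Load and store: unaligned/aligned full-width variants plus low/high half and
// two-pointer (halves) loads, generated per element type by the macros below.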
#define OPENCV_HAL_IMPL_LASX_LOADSTORE(_Tpvec, _Tp) \
    inline _Tpvec v256_load(const _Tp* ptr) \
    { return _Tpvec(__lasx_xvld(ptr, 0)); } \
    inline _Tpvec v256_load_aligned(const _Tp* ptr) \
    { return _Tpvec(__lasx_xvld(ptr, 0)); } \
    inline _Tpvec v256_load_low(const _Tp* ptr) \
    { \
        __m128i v128 = __lsx_vld(ptr, 0); \
        return _Tpvec(*((__m256i*)&v128)); \
    } \
    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        __m128i vlo = __lsx_vld(ptr0, 0); \
        __m128i vhi = __lsx_vld(ptr1, 0); \
        return _Tpvec(_v256_combine(vlo, vhi)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { __lasx_xvst(a.val, ptr, 0); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { __lasx_xvst(a.val, ptr, 0); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { __lasx_xvst(a.val, ptr, 0); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            __lasx_xvst(a.val, ptr, 0); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
            __lasx_xvst(a.val, ptr, 0); \
        else \
            __lasx_xvst(a.val, ptr, 0); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint8x32,  uchar)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int8x32,   schar)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint16x16, ushort)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int16x16,  short)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint32x8,  unsigned)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int32x8,   int)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint64x4,  uint64)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int64x4,   int64)
#define OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \
    inline _Tpvec v256_load(const _Tp* ptr) \
    { return _Tpvec(__lasx_xvld(ptr, 0)); } \
    inline _Tpvec v256_load_aligned(const _Tp* ptr) \
    { return _Tpvec(__lasx_xvld(ptr, 0)); } \
    inline _Tpvec v256_load_low(const _Tp* ptr) \
    { \
        __m128i v128 = __lsx_vld(ptr, 0); \
        return _Tpvec(*((__m256i*)&v128)); \
    } \
    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        halfreg vlo = __lsx_vld(ptr0, 0); \
        halfreg vhi = __lsx_vld(ptr1, 0); \
        return _Tpvec(_v256_combine(vlo, vhi)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { __lasx_xvst(a.val, ptr, 0); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { __lasx_xvst(a.val, ptr, 0); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { __lasx_xvst(a.val, ptr, 0); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            __lasx_xvst(a.val, ptr, 0); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
            __lasx_xvst(a.val, ptr, 0); \
        else \
            __lasx_xvst(a.val, ptr, 0); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }
OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float32x8, float,  __m128i)
OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float64x4, double, __m128i)
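// setzero/setall constructors and v_reinterpret_as_* casts between all vector types.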
inline __m256i _lasx_256_castps_si256(const __m256& v)
{ return __m256i(v); }

inline __m256i _lasx_256_castpd_si256(const __m256d& v)
{ return __m256i(v); }

#define OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
    { return _Tpvec(cast(a.val)); }
#define OPENCV_HAL_IMPL_LASX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
    inline _Tpvec v256_setzero_##suffix() \
    { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
    inline _Tpvec v256_setall_##suffix(_Tp v) \
    { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4,  suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4,   suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float32x8, suffix, _lasx_256_castps_si256) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float64x4, suffix, _lasx_256_castpd_si256)
OPENCV_HAL_IMPL_LASX_INIT(v_uint8x32,  uchar,    u8,  b, int)
OPENCV_HAL_IMPL_LASX_INIT(v_int8x32,   schar,    s8,  b, int)
OPENCV_HAL_IMPL_LASX_INIT(v_uint16x16, ushort,   u16, h, int)
OPENCV_HAL_IMPL_LASX_INIT(v_int16x16,  short,    s16, h, int)
OPENCV_HAL_IMPL_LASX_INIT(v_uint32x8,  unsigned, u32, w, int)
OPENCV_HAL_IMPL_LASX_INIT(v_int32x8,   int,      s32, w, int)
OPENCV_HAL_IMPL_LASX_INIT(v_uint64x4,  uint64,   u64, d, long int)
OPENCV_HAL_IMPL_LASX_INIT(v_int64x4,   int64,    s64, d, long int)
inline __m256 _lasx_256_castsi256_ps(const __m256i &v)
{ return __m256(v); }

inline __m256d _lasx_256_castsi256_pd(const __m256i &v)
{ return __m256d(v); }

#define OPENCV_HAL_IMPL_LASX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
    inline _Tpvec v256_setzero_##suffix() \
    { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
    inline _Tpvec v256_setall_##suffix(_Tp v) \
    { return _Tpvec(_v256_setall_##zsuffix(v)); } \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32,  suffix, cast) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32,   suffix, cast) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16,  suffix, cast) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8,  suffix, cast) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8,   suffix, cast) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4,  suffix, cast) \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4,   suffix, cast)
OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float32x8, float,  f32, ps, _lasx_256_castsi256_ps)
OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float64x4, double, f64, pd, _lasx_256_castsi256_pd)

inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
{ return a; }
inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
{ return v_float32x8(_lasx_256_castps_si256(__m256(a.val))); }

inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
{ return a; }
inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
{ return v_float64x4(_lasx_256_castpd_si256(__m256d(a.val))); }
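// Value reordering: interleave (unpack), shuffle, zip, diagonal combine and
// 128-bit alignment helpers.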
#define OPENCV_HAL_IMPL_LASX_UNPACK(_Tpvec, suffix) \
    inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lasx_xvilvl_##suffix(__m256i(b.val), __m256i(a.val))); } \
    inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lasx_xvilvh_##suffix(__m256i(b.val), __m256i(a.val))); }

OPENCV_HAL_IMPL_LASX_UNPACK(v_uint8x32,  b)
OPENCV_HAL_IMPL_LASX_UNPACK(v_int8x32,   b)
OPENCV_HAL_IMPL_LASX_UNPACK(v_uint16x16, h)
OPENCV_HAL_IMPL_LASX_UNPACK(v_int16x16,  h)
OPENCV_HAL_IMPL_LASX_UNPACK(v_uint32x8,  w)
OPENCV_HAL_IMPL_LASX_UNPACK(v_int32x8,   w)
OPENCV_HAL_IMPL_LASX_UNPACK(v_uint64x4,  d)
OPENCV_HAL_IMPL_LASX_UNPACK(v_int64x4,   d)
OPENCV_HAL_IMPL_LASX_UNPACK(v_float32x8, w)
OPENCV_HAL_IMPL_LASX_UNPACK(v_float64x4, d)
#define OPENCV_HAL_IMPL_LASX_SHUFFLE(_Tpvec, intrin) \
    template<int m> \
    inline _Tpvec v256_shuffle(const _Tpvec& a) \
    { return _Tpvec(__lasx_xvshuf4i_##intrin(a.val, m)); }

OPENCV_HAL_IMPL_LASX_SHUFFLE(v_uint32x8, w)
OPENCV_HAL_IMPL_LASX_SHUFFLE(v_int32x8,  w)

template<int m>
inline v_float32x8 v256_shuffle(const v_float32x8 &a)
{ return v_float32x8(__lasx_xvshuf4i_w(*((__m256i*)&a.val), m)); }

template<int m>
inline v_float64x4 v256_shuffle(const v_float64x4 &a)
{
    int imm8 = m & 0b0001;
    if (m & 0x0b0010) imm8 |= 0b0100;
    if (m & 0x0b0100) imm8 |= 0b110000;
    else imm8 |= 0b100000;
    if (m & 0x0b1000) imm8 |= 0b11000000;
    else imm8 |= 0b10000000;
    return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8));
}
template<typename _Tpvec>
inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
{
    ab0 = v256_unpacklo(a, b);
    ab1 = v256_unpackhi(a, b);
}

template<typename _Tpvec>
inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(__lasx_xvpermi_q(a.val, b.val, 0x12)); }

inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
{ return v_float32x8(__lasx_xvpermi_q(a.val, b.val, 0x12)); }

inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
{ return v_float64x4(__lasx_xvpermi_q(a.val, b.val, 0x12)); }

template<typename _Tpvec>
inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
{ return v256_permute2x128<0x03>(a, b); }
inline __m256i _v256_alignr_b(const __m256i &a, const __m256i &b, const int imm)
{
    if (imm == 8)
    {
        return __lasx_xvshuf4i_d(b, a, 0x9);
    }
    __m256i byteIndex = _v256_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    return __lasx_xvshuf_b(a, b, __lasx_xvadd_b(__lasx_xvreplgr2vr_b(imm), byteIndex));
}
template<typename _Tpvec>
inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_v256_alignr_b(a.val, b.val, 8)); }
inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
{ return v_float64x4(__lasx_xvshuf4i_d(b.val, a.val, 0x9)); }

template<typename _Tpvec>
inline _Tpvec v256_swap_halves(const _Tpvec& a)
{ return v256_permute2x128<1>(a, a); }

template<typename _Tpvec>
inline _Tpvec v256_reverse_64(const _Tpvec& a)
{ return v256_permute4x64<0x1b>(a); }
#define OPENCV_HAL_IMPL_LASX_ZIP(_Tpvec) \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
    { return v256_permute2x128<0x02>(a, b); } \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
    { return v256_permute2x128<0x13>(a, b); } \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
                            _Tpvec& c, _Tpvec& d) \
    { \
        _Tpvec a1b0 = v256_alignr_128(a, b); \
        c = v256_combine_diagonal(a, a1b0); \
        d = v256_combine_diagonal(a1b0, b); \
    } \
    inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
                      _Tpvec& ab0, _Tpvec& ab1) \
    { \
        _Tpvec ab0ab2, ab1ab3; \
        v256_zip(a, b, ab0ab2, ab1ab3); \
        v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
    }

OPENCV_HAL_IMPL_LASX_ZIP(v_uint8x32)
OPENCV_HAL_IMPL_LASX_ZIP(v_int8x32)
OPENCV_HAL_IMPL_LASX_ZIP(v_uint16x16)
OPENCV_HAL_IMPL_LASX_ZIP(v_int16x16)
OPENCV_HAL_IMPL_LASX_ZIP(v_uint32x8)
OPENCV_HAL_IMPL_LASX_ZIP(v_int32x8)
OPENCV_HAL_IMPL_LASX_ZIP(v_uint64x4)
OPENCV_HAL_IMPL_LASX_ZIP(v_int64x4)
OPENCV_HAL_IMPL_LASX_ZIP(v_float32x8)
OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
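// Arithmetic: 8- and 16-bit add/sub use saturating instructions, wider integer
// and floating-point types map directly onto the plain LASX add/sub/mul/div.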
#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d)

OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)

inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)

inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i pl = __lasx_xvmul_h(a.val, b.val);
    __m256i ph = __lasx_xvmuh_hu(a.val, b.val);
    __m256i p0 = __lasx_xvilvl_h(ph, pl);
    __m256i p1 = __lasx_xvilvh_h(ph, pl);
    return v_uint16x16(_v256_packs_epu32(p0, p1));
}

inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
    __m256i pl = __lasx_xvmul_h(a.val, b.val);
    __m256i ph = __lasx_xvmuh_h(a.val, b.val);
    __m256i p0 = __lasx_xvilvl_h(ph, pl);
    __m256i p1 = __lasx_xvilvh_h(ph, pl);
    return v_int16x16(_lasx_packs_w(p0, p1));
}

inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
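// Non-saturating (wrap-around) arithmetic and widening multiplies.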
#define OPENCV_HAL_IMPL_LASX_BIN_FUNC(func, _Tpvec, intrin) \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint8x32, __lasx_xvadd_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int8x32, __lasx_xvadd_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint16x16, __lasx_xvadd_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int16x16, __lasx_xvadd_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint8x32, __lasx_xvsub_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int8x32, __lasx_xvsub_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint16x16, __lasx_xvsub_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int16x16, __lasx_xvsub_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_uint16x16, __lasx_xvmul_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16, __lasx_xvmul_h)
inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i p0 = __lasx_xvmulwev_h_bu(a.val, b.val);
    __m256i p1 = __lasx_xvmulwod_h_bu(a.val, b.val);
    return v_uint8x32(__lasx_xvpackev_b(p1, p0));
}

inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
                         v_uint16x16& c, v_uint16x16& d)
{
    v_uint16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
                         v_int16x16& c, v_int16x16& d)
{
    v_int16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
                         v_int32x8& c, v_int32x8& d)
{
    v_int16x16 vhi = v_int16x16(__lasx_xvmuh_h(a.val, b.val));

    v_int16x16 v0, v1;
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);
}

inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
                         v_uint32x8& c, v_uint32x8& d)
{
    v_uint16x16 vhi = v_uint16x16(__lasx_xvmuh_hu(a.val, b.val));

    v_uint16x16 v0, v1;
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);
}

inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
                         v_uint64x4& c, v_uint64x4& d)
{
    __m256i v0 = __lasx_xvmulwev_d_wu(a.val, b.val);
    __m256i v1 = __lasx_xvmulwod_d_wu(a.val, b.val);
    v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
}
inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(__lasx_xvmuh_h(a.val, b.val)); }
inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); }
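// Bitwise shifts: run-time shift operators and compile-time v_shl/v_shr,
// followed by bitwise logic for integer and floating-point vectors.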
#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
    { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
    { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
    { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
    { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
    template<int imm> \
    inline _Tpuvec v_shl(const _Tpuvec& a) \
    { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
    template<int imm> \
    inline _Tpsvec v_shl(const _Tpsvec& a) \
    { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
    template<int imm> \
    inline _Tpuvec v_shr(const _Tpuvec& a) \
    { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
    template<int imm> \
    inline _Tpsvec v_shr(const _Tpsvec& a) \
    { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }

OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h)
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8,  v_int32x8,  w, __lasx_xvsra_w)
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4,  v_int64x4,  d, __lasx_xvsra_d)
#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \
    OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \
    OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
    OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { return _Tpvec(__lasx_xvnori_b(a.val, 0)); }

OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32,  v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32,   v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint16x16, v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int16x16,  v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint32x8,  v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int32x8,   v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4,  v, __lasx_xvreplgr2vr_d(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4,   v, __lasx_xvreplgr2vr_d(-1))

#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }

#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \
    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \
    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \
    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }

OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float64x4, v, __lasx_xvreplgr2vr_d(-1), _lasx_256_castsi256_pd)
#define OPENCV_HAL_IMPL_LASX_SELECT(_Tpvec) \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lasx_xvbitsel_v(b.val, a.val, mask.val)); }

OPENCV_HAL_IMPL_LASX_SELECT(v_uint8x32)
OPENCV_HAL_IMPL_LASX_SELECT(v_int8x32)
OPENCV_HAL_IMPL_LASX_SELECT(v_uint16x16)
OPENCV_HAL_IMPL_LASX_SELECT(v_int16x16)
OPENCV_HAL_IMPL_LASX_SELECT(v_uint32x8)
OPENCV_HAL_IMPL_LASX_SELECT(v_int32x8)

inline v_float32x8 v_select(const v_float32x8 &mask, const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }

inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }
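// Comparison: integer == and > map onto xvseq/xvslt and the remaining operators
// are derived from them; float comparisons use the xvfcmp_* predicates.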
#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    { return ~(a == b); } \
    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
    { return b > a; } \
    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
    { return ~(a < b); } \
    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
    { return b >= a; }

#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
    { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
    { \
        return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \
    } \
    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
    { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
    { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \
    OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \
    OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)

OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint8x32,  v_int8x32,  b, bu)
OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8,  v_int32x8,  w, wu)

#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \
    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    { return ~(a == b); }

OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)

#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }

#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LASX_CMP_FLT(<,  xvfcmp_clt, _Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)

OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }

inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }

inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }

inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }

inline v_float32x8 v_not_nan(const v_float32x8& a)
{ return v_float32x8(__lasx_xvfcmp_cor_s(a.val, a.val)); }
inline v_float64x4 v_not_nan(const v_float64x4& a)
{ return v_float64x4(__lasx_xvfcmp_cor_d(a.val, a.val)); }
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint8x32, __lasx_xvmin_bu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint8x32, __lasx_xvmax_bu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int8x32, __lasx_xvmin_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int8x32, __lasx_xvmax_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint16x16, __lasx_xvmin_hu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint16x16, __lasx_xvmax_hu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int16x16, __lasx_xvmin_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int16x16, __lasx_xvmax_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint32x8, __lasx_xvmin_wu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint32x8, __lasx_xvmax_wu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int32x8, __lasx_xvmin_w)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int32x8, __lasx_xvmax_w)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float32x8, __lasx_xvfmin_s)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float32x8, __lasx_xvfmax_s)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float64x4, __lasx_xvfmin_d)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float64x4, __lasx_xvfmax_d)
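// Rotation: shift lanes across one vector or a pair of vectors by a
// compile-time element count, built on the byte-align helpers above.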
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
{
    enum {IMM_R = (16 - imm) & 0xFF};
    enum {IMM_R2 = (32 - imm) & 0xFF};

    if (imm == 0)  return a;
    if (imm == 32) return b;
    if (imm > 32)  return v_uint8x32();

    __m256i swap = _v256_permute2x128<0x21>(a.val, b.val);
    if (imm == 16) return v_uint8x32(swap);
    if (imm < 16)  return v_uint8x32(_v256_alignr_b(a.val, swap, IMM_R));
    return v_uint8x32(_v256_alignr_b(swap, b.val, IMM_R2));
}
template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
{
    enum {IMM_L = (imm - 16) & 0xFF};

    if (imm == 0)  return a;
    if (imm == 32) return b;
    if (imm > 32)  return v_uint8x32();

    __m256i swap = _v256_permute2x128<0x03>(a.val, b.val);
    if (imm == 16) return v_uint8x32(swap);
    if (imm < 16)  return v_uint8x32(_v256_alignr_b(swap, a.val, imm));
    return v_uint8x32(_v256_alignr_b(b.val, swap, IMM_L));
}
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
{
    enum {IMM_L = (imm - 16) & 0xFF};
    enum {IMM_R = (16 - imm) & 0xFF};

    if (imm == 0) return a;
    if (imm > 32) return v_uint8x32();

    __m256i vzero = __lasx_xvreplgr2vr_w(0);
    __m256i swapz = __lasx_xvpermi_q(a.val, vzero, 0x20);
    if (imm == 16) return v_uint8x32(swapz);
    if (imm < 16)  return v_uint8x32(_v256_alignr_b(a.val, swapz, IMM_R));
    return v_uint8x32(__lasx_xvbsll_v(swapz, IMM_L));
}
template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
{
    enum {IMM_L = (imm - 16) & 0xFF};

    if (imm == 0) return a;
    if (imm > 32) return v_uint8x32();

    __m256i vzero = __lasx_xvreplgr2vr_w(0);
    __m256i swapz = __lasx_xvpermi_q(vzero, a.val, 0x21);
    if (imm == 16) return v_uint8x32(swapz);
    if (imm < 16)  return v_uint8x32(_v256_alignr_b(swapz, a.val, imm));
    return v_uint8x32(__lasx_xvbsrl_v(swapz, IMM_L));
}
#define OPENCV_HAL_IMPL_LASX_ROTATE_CAST(intrin, _Tpvec, cast) \
    template<int imm> \
    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
    { \
        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a), \
                                       v_reinterpret_as_u8(b)); \
        return _Tpvec(cast(ret.val)); \
    } \
    template<int imm> \
    inline _Tpvec intrin(const _Tpvec& a) \
    { \
        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a)); \
        return _Tpvec(cast(ret.val)); \
    }

#define OPENCV_HAL_IMPL_LASX_ROTATE(_Tpvec) \
    OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left,  _Tpvec, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)

OPENCV_HAL_IMPL_LASX_ROTATE(v_int8x32)
OPENCV_HAL_IMPL_LASX_ROTATE(v_uint16x16)
OPENCV_HAL_IMPL_LASX_ROTATE(v_int16x16)
OPENCV_HAL_IMPL_LASX_ROTATE(v_uint32x8)
OPENCV_HAL_IMPL_LASX_ROTATE(v_int32x8)
OPENCV_HAL_IMPL_LASX_ROTATE(v_uint64x4)
OPENCV_HAL_IMPL_LASX_ROTATE(v_int64x4)

OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left,  v_float32x8, _lasx_256_castsi256_ps)
OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float32x8, _lasx_256_castsi256_ps)
OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left,  v_float64x4, _lasx_256_castsi256_pd)
OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float64x4, _lasx_256_castsi256_pd)
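// Reverse the order of lanes within a vector.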
inline v_uint8x32 v_reverse(const v_uint8x32 &a)
{
    static const __m256i perm = _v256_setr_b(
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm);
    return v_uint8x32(__lasx_xvpermi_q(vec, vec, 1));
}

inline v_int8x32 v_reverse(const v_int8x32 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x16 v_reverse(const v_uint16x16 &a)
{
    __m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B);
    vec = __lasx_xvshuf4i_w(vec, 0x4E);
    return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E));
}

inline v_int16x16 v_reverse(const v_int16x16 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x8 v_reverse(const v_uint32x8 &a)
{
    __m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B);
    return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E));
}

inline v_int32x8 v_reverse(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x8 v_reverse(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x4 v_reverse(const v_uint64x4 &a)
{
    return v_uint64x4(__lasx_xvpermi_d(a.val, 0x1b));
}

inline v_int64x4 v_reverse(const v_int64x4 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x4 v_reverse(const v_float64x4 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
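// Reductions: horizontal sums chain the widening xvhaddw instructions;
// min/max fold the two 128-bit halves and then reduce pairwise.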
inline unsigned v_reduce_sum(const v_uint8x32& a)
{
    __m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val);
    __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
    __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
    __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
    return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
}

inline int v_reduce_sum(const v_int8x32& a)
{
    __m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val);
    __m256i t2 = __lasx_xvhaddw_w_h(t1, t1);
    __m256i t3 = __lasx_xvhaddw_d_w(t2, t2);
    __m256i t4 = __lasx_xvhaddw_q_d(t3, t3);
    return (int)(((v8i32)t4)[0]+((v8i32)t4)[4]);
}
#define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { \
        __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
        val = intrin(val, __lsx_vbsrl_v(val,8)); \
        val = intrin(val, __lsx_vbsrl_v(val,4)); \
        val = intrin(val, __lsx_vbsrl_v(val,2)); \
        val = intrin(val, __lsx_vbsrl_v(val,1)); \
        return (sctype)__lsx_vpickve2gr_w(val, 0); \
    }

OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, min, __lsx_vmin_bu)
OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32,  schar, min, __lsx_vmin_b)
OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, max, __lsx_vmax_bu)
OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32,  schar, max, __lsx_vmax_b)

#define OPENCV_HAL_IMPL_LASX_REDUCE_16(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { \
        __m128i v0 = _v256_extract_low(a.val); \
        __m128i v1 = _v256_extract_high(a.val); \
        v0 = intrin(v0, v1); \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 2)); \
        return (sctype) __lsx_vpickve2gr_w(v0, 0); \
    }

OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, min, __lsx_vmin_hu)
OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16,  short,  min, __lsx_vmin_h)
OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, max, __lsx_vmax_hu)
OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16,  short,  max, __lsx_vmax_h)

#define OPENCV_HAL_IMPL_LASX_REDUCE_8(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { \
        __m128i v0 = _v256_extract_low(a.val); \
        __m128i v1 = _v256_extract_high(a.val); \
        v0 = intrin(v0, v1); \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \
        return (sctype) __lsx_vpickve2gr_w(v0, 0); \
    }

OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, min, __lsx_vmin_wu)
OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8,  int,      min, __lsx_vmin_w)
OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, max, __lsx_vmax_wu)
OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8,  int,      max, __lsx_vmax_w)
#define OPENCV_HAL_IMPL_LASX_REDUCE_FLT(func, intrin) \
    inline float v_reduce_##func(const v_float32x8& a) \
    { \
        __m128 v0 = _v256_extract_low(a.val); \
        __m128 v1 = _v256_extract_high(a.val); \
        v0 = intrin(v0, v1); \
        v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x0e))); \
        v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x01))); \
        float *fvalue = (float*)&v0; \
        return fvalue[0]; \
    }

OPENCV_HAL_IMPL_LASX_REDUCE_FLT(min, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s)
inline int v_reduce_sum(const v_int32x8& a)
{
    __m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val);
    __m256i t2 = __lasx_xvhaddw_q_d(t1, t1);
    return (int)(((v8i32)t2)[0]+((v8i32)t2)[4]);
}

inline float v_reduce_sum(const v_float32x8& a)
{
    float result = 0;
    float *pa = (float*)&a;
    for (int i = 0; i < 2; ++i) {
        result += pa[i*4] + pa[i*4+1] + pa[i*4+2] + pa[i*4+3];
    }
    return result;
}

inline uint64 v_reduce_sum(const v_uint64x4& a)
{
    __m256i t0 = __lasx_xvhaddw_qu_du(a.val, a.val);
    return (uint64)(((v4u64)t0)[0] + ((v4u64)t0)[2]);
}

inline int64 v_reduce_sum(const v_int64x4& a)
{
    __m256i t0 = __lasx_xvhaddw_q_d(a.val, a.val);
    return (int64)(((v4i64)t0)[0] + ((v4i64)t0)[2]);
}

inline double v_reduce_sum(const v_float64x4& a)
{
    double *pa = (double*)&a;
    return pa[0] + pa[1] + pa[2] + pa[3];
}
inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
                                 const v_float32x8& c, const v_float32x8& d)
{
    float *pa = (float*)&a;
    float *pb = (float*)&b;
    float *pc = (float*)&c;
    float *pd = (float*)&d;

    float v0 = pa[0] + pa[1] + pa[2] + pa[3];
    float v1 = pb[0] + pb[1] + pb[2] + pb[3];
    float v2 = pc[0] + pc[1] + pc[2] + pc[3];
    float v3 = pd[0] + pd[1] + pd[2] + pd[3];
    float v4 = pa[4] + pa[5] + pa[6] + pa[7];
    float v5 = pb[4] + pb[5] + pb[6] + pb[7];
    float v6 = pc[4] + pc[5] + pc[6] + pc[7];
    float v7 = pd[4] + pd[5] + pd[6] + pd[7];
    return v_float32x8(v0, v1, v2, v3, v4, v5, v6, v7);
}
inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i t0 = __lasx_xvabsd_bu(a.val, b.val);
    __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
    __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
    __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
    __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
    return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
}

inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
{
    __m256i t0 = __lasx_xvabsd_b(a.val, b.val);
    __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
    __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
    __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
    __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
    return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
}

inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
    v_uint32x8 l, h;
    v_expand(v_add_wrap(a - b, b - a), l, h);
    return v_reduce_sum(l + h);
}

inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
    v_uint32x8 l, h;
    v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
    return v_reduce_sum(l + h);
}

inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
    return v_reduce_sum(v_max(a, b) - v_min(a, b));
}

inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 m = a < b;
    return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
}

inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
    v_float32x8 a_b = a - b;
    return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
}
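// Per-lane population count, followed by sign-mask extraction, scan_forward
// and the check_all/check_any predicates.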
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{ return v_uint8x32(__lasx_xvpcnt_b(a.val)); }
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{ return v_uint16x16(__lasx_xvpcnt_h(a.val)); }
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{ return v_uint32x8(__lasx_xvpcnt_w(a.val)); }
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{ return v_uint64x4(__lasx_xvpcnt_d(a.val)); }
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x8 v_popcount(const v_int32x8& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x4 v_popcount(const v_int64x4& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_int8x32& a)
{
    __m256i result = __lasx_xvmskltz_b(a.val);
    int mask = __lasx_xvpickve2gr_w(result, 0);
    mask |= (__lasx_xvpickve2gr_w(result, 4) << 16);
    return mask;
}
inline int v_signmask(const v_uint8x32& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }

inline int v_signmask(const v_int16x16& a)
{ return v_signmask(v_pack(a, a)) & 0xFFFF; }
inline int v_signmask(const v_uint16x16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_int32x8& a)
{
    __m256i result = __lasx_xvmskltz_w(a.val);
    int mask = __lasx_xvpickve2gr_w(result, 0);
    mask |= (__lasx_xvpickve2gr_w(result, 4) << 4);
    return mask;
}

inline int v_signmask(const v_int64x4& a)
{
    __m256i result = __lasx_xvmskltz_d(a.val);
    int mask = __lasx_xvpickve2gr_w(result, 0);
    mask |= (__lasx_xvpickve2gr_w(result, 4) << 2);
    return mask;
}
inline int v_signmask(const v_uint64x4& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }

inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
#define OPENCV_HAL_IMPL_LASX_CHECK(_Tpvec, allmask) \
    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
OPENCV_HAL_IMPL_LASX_CHECK(v_uint8x32, -1)
OPENCV_HAL_IMPL_LASX_CHECK(v_int8x32, -1)
OPENCV_HAL_IMPL_LASX_CHECK(v_uint32x8, 255)
OPENCV_HAL_IMPL_LASX_CHECK(v_int32x8, 255)
OPENCV_HAL_IMPL_LASX_CHECK(v_uint64x4, 15)
OPENCV_HAL_IMPL_LASX_CHECK(v_int64x4, 15)
OPENCV_HAL_IMPL_LASX_CHECK(v_float32x8, 255)
OPENCV_HAL_IMPL_LASX_CHECK(v_float64x4, 15)

#define OPENCV_HAL_IMPL_LASX_CHECK_SHORT(_Tpvec) \
    inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
    inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_uint16x16)
OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
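// Fused multiply-add, square root, inverse square root, absolute value and
// magnitude helpers.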
#define OPENCV_HAL_IMPL_LASX_MULADD(_Tpvec, suffix) \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_sqrt(const _Tpvec& x) \
    { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_fma(a, a, b * b); } \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_sqrt(v_fma(a, a, b*b)); }

OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)

inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
    return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val));
}

inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
    return v_fma(a, b, c);
}
inline v_float32x8 v_invsqrt(const v_float32x8& x)
{ return v_float32x8(__lasx_xvfrsqrt_s(x.val)); }

inline v_float64x4 v_invsqrt(const v_float64x4& x)
{ return v_float64x4(__lasx_xvfrsqrt_d(x.val)); }

#define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix) \
    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
    { return v_u##_Tpvec(__lasx_xvabsd_##suffix(x.val, __lasx_xvreplgr2vr_w(0))); }

OPENCV_HAL_IMPL_LASX_ABS(int8x32,  b)
OPENCV_HAL_IMPL_LASX_ABS(int16x16, h)
OPENCV_HAL_IMPL_LASX_ABS(int32x8,  w)

inline v_float32x8 v_abs(const v_float32x8& x)
{ return v_float32x8(*((__m256i*)&x) & __lasx_xvreplgr2vr_w(0x7fffffff)); }
inline v_float64x4 v_abs(const v_float64x4& x)
{ return v_float64x4(*((__m256i*)&x) & __lasx_xvreplgr2vr_d(0x7fffffffffffffff)); }
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
{ return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); }
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
{ return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); }
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
{ return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); }

inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
{ return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); }
inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
{ return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); }
inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }

inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(a - b); }

inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(a - b); }

inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
    v_int8x32 d = a - b;
    v_int8x32 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_max(a, b) - v_min(a, b); }
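// Rounding (round/trunc/floor/ceil) and conversions between integer and
// floating-point vectors.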
inline v_int32x8 v_round(const v_float32x8& a)
{ return v_int32x8(__lasx_xvftint_w_s(a.val)); }

inline v_int32x8 v_round(const v_float64x4& a)
{ __m256i t = __lasx_xvftint_w_d(a.val, a.val);
  return v_int32x8(__lasx_xvpermi_d(t, 0x88)); }

inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
{
    __m256i abi = __lasx_xvftint_w_d(b.val, a.val);
    return v_int32x8(__lasx_xvpermi_d(abi, 0b11011000));
}

inline v_int32x8 v_trunc(const v_float32x8& a)
{ return v_int32x8(__lasx_xvftintrz_w_s(a.val)); }

inline v_int32x8 v_trunc(const v_float64x4& a)
{ __m256i t = __lasx_xvftintrz_w_d(a.val, a.val);
  return v_int32x8(__lasx_xvpermi_d(t, 0x88)); }

inline v_int32x8 v_floor(const v_float32x8& a)
{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrm_s(a.val)))); }

inline v_int32x8 v_floor(const v_float64x4& a)
{ return v_trunc(v_float64x4(__lasx_xvfrintrm_d(a.val))); }

inline v_int32x8 v_ceil(const v_float32x8& a)
{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrp_s(a.val)))); }

inline v_int32x8 v_ceil(const v_float64x4& a)
{ return v_trunc(v_float64x4(__lasx_xvfrintrp_d(a.val))); }
inline v_float32x8 v_cvt_f32(const v_int32x8& a)
{ return v_float32x8(__lasx_xvffint_s_w(a.val)); }

inline v_float32x8 v_cvt_f32(const v_float64x4& a)
{ return v_float32x8(__lasx_xvpermi_d(__lasx_xvfcvt_s_d(a.val, a.val), 0x88)); }

inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
{
    __m256 abf = __lasx_xvfcvt_s_d(a.val, b.val);
    return v_float32x8(__lasx_xvpermi_d(abf, 0x8D));
}

inline v_float64x4 v_cvt_f64(const v_int32x8& a)
{
    __m256i alow = __lasx_xvpermi_d(a.val, 0x10);
    return v_float64x4(__lasx_xvffintl_d_w(alow));
}

inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
{
    __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32);
    return v_float64x4(__lasx_xvffintl_d_w(ahigh));
}

inline v_float64x4 v_cvt_f64(const v_float32x8& a)
{
    __m256i alow = __lasx_xvpermi_d(a.val, 0x10);
    return v_float64x4(__lasx_xvfcvtl_d_s((__m256)alow));
}

inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
{
    __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32);
    return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh));
}

inline v_float64x4 v_cvt_f64(const v_int64x4& v)
{ return v_float64x4(__lasx_xvffint_d_l(v.val)); }
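// Lookup-table access: gather single elements, pairs or quads from a table
// using an index array or an index vector.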
inline v_int8x32 v256_lut(const schar* tab, const int* idx)
{
    return v_int8x32(_v256_setr_b(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]],
                                  tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]],
                                  tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]], tab[idx[16]], tab[idx[17]],
                                  tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
                                  tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]],
                                  tab[idx[30]], tab[idx[31]]));
}
inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
{
    return v_int8x32(_v256_setr_h(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]),
                                  *(const short*)(tab + idx[ 3]), *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]),
                                  *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]), *(const short*)(tab + idx[ 8]),
                                  *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
                                  *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]),
                                  *(const short*)(tab + idx[15])));
}

inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x32(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
                                  *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
                                  *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7])));
}
inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
inline v_int16x16 v256_lut(const short* tab, const int* idx)
{
    return v_int16x16(_v256_setr_h(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]],
                                   tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]],
                                   tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]],
                                   tab[idx[15]]));
}

inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x16(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                   *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
                                   *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
                                   *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) ));
}

inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
{
    return v_int16x16(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
                                   *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
}
inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
inline v_int32x8 v256_lut(const int* tab, const int* idx)
{
    return v_int32x8(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
                                  *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
                                  *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7])));
}

inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x8(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
                                  *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3])));
}

inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
{
    return v_int32x8(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0)));
}

inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }

inline v_int64x4 v256_lut(const int64* tab, const int* idx)
{
    return v_int64x4(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
                                  *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3])));
}

inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
{
    return v_int64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0)));
}

inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
inline v_float32x8 v256_lut(const float* tab, const int* idx)
{
    return v_float32x8(_v256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
                                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
}

inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }

inline v_float64x4 v256_lut(const double* tab, const int* idx)
{
    return v_float64x4(_v256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}

inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx)
{ return v_float64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); }
inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
{
    int *idx = (int*)&idxvec.val;
    return v256_lut(tab, idx);
}

inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}

inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
{
    const int *idx = (const int*)&idxvec.val;
    return v256_lut(tab, idx);
}

inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
{
    const int *idx = (const int*)&idxvec.val;
    return v256_lut(tab, idx);
}

inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
{
    const int *idx = (const int*)&idxvec.val;
    __m128i xy01, xy45, xy23, xy67;
    xy01 = __lsx_vld(tab + idx[0], 0);
    xy01 = __lsx_vextrins_d(xy01, __lsx_vld(tab + idx[1], 0), 0x10);
    xy45 = __lsx_vld(tab + idx[4], 0);
    xy45 = __lsx_vextrins_d(xy45, __lsx_vld(tab + idx[5], 0), 0x10);
    __m256i xy0145 = _v256_combine(xy01, xy45);
    xy23 = __lsx_vld(tab + idx[2], 0);
    xy23 = __lsx_vextrins_d(xy23, __lsx_vld(tab + idx[3], 0), 0x10);
    xy67 = __lsx_vld(tab + idx[6], 0);
    xy67 = __lsx_vextrins_d(xy67, __lsx_vld(tab + idx[7], 0), 0x10);
    __m256i xy2367 = _v256_combine(xy23, xy67);

    __m256i xxyy0145 = __lasx_xvilvl_w(xy2367, xy0145);
    __m256i xxyy2367 = __lasx_xvilvh_w(xy2367, xy0145);

    x = v_float32x8(__lasx_xvilvl_w(xxyy2367, xxyy0145));
    y = v_float32x8(__lasx_xvilvh_w(xxyy2367, xxyy0145));
}
inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
{
    const int *idx = (const int*)&idxvec.val;
    __m128i xy0 = __lsx_vld(tab + idx[0], 0);
    __m128i xy2 = __lsx_vld(tab + idx[2], 0);
    __m128i xy1 = __lsx_vld(tab + idx[1], 0);
    __m128i xy3 = __lsx_vld(tab + idx[3], 0);
    __m256i xy02 = _v256_combine(xy0, xy2);
    __m256i xy13 = _v256_combine(xy1, xy3);

    x = v_float64x4(__lasx_xvilvl_d(xy13, xy02));
    y = v_float64x4(__lasx_xvilvh_d(xy13, xy02));
}
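
// Illustrative usage sketch (not part of the original header; the helper name
// `lasx_example_lut_xy` is hypothetical): gather eight floats through a vector
// of 32-bit indices with v_lut, and split interleaved (x, y) pairs addressed
// by the same indices with v_lut_deinterleave.
inline void lasx_example_lut_xy(const float* tab, const v_int32x8& idxvec,
                                v_float32x8& vals, v_float32x8& x, v_float32x8& y)
{
    vals = v_lut(tab, idxvec);              // vals[i] = tab[idx[i]]
    v_lut_deinterleave(tab, idxvec, x, y);  // x[i] = tab[idx[i]], y[i] = tab[idx[i] + 1]
}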
inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
{
    return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
               _v256_set_d(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
}
inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
{
    return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
               _v256_set_d(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
}
inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
{
    return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
               _v256_set_d(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
}
inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
{
    return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
               _v256_set_d(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
}
inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
{
    return v_int32x8(__lasx_xvshuf4i_w(vec.val, 0xd8));
}

inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
{
    __m256i vzero = __lasx_xvreplgr2vr_w(0);
    __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
               _v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100));
    return v_int8x32(__lasx_xvperm_w(t1,
               _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
{
    __m256i vzero = __lasx_xvreplgr2vr_w(0);
    __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
               _v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100));
    return v_int16x16(__lasx_xvperm_w(t1,
               _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec)
{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
{
    return v_int32x8(__lasx_xvperm_w(vec.val,
               _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec)
{ return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
{
    return v_float32x8(__lasx_xvperm_w(*(__m256i*)(&vec.val),
               _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
// 16 >> 32
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_add(v_dotprod(a, b), c); }

// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
{
    __m256i even = __lasx_xvmulwev_d_w(a.val, b.val);
    return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
}
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{
    __m256i even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val);
    return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
}

// 8 >> 32
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i even = __lasx_xvmulwev_h_bu(a.val, b.val);
    __m256i odd  = __lasx_xvmulwod_h_bu(a.val, b.val);
    __m256i prod0 = __lasx_xvhaddw_wu_hu(even, even);
    __m256i prod1 = __lasx_xvhaddw_wu_hu(odd, odd);
    return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }

inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
    __m256i even = __lasx_xvmulwev_h_b(a.val, b.val);
    __m256i odd  = __lasx_xvmulwod_h_b(a.val, b.val);
    __m256i prod0 = __lasx_xvhaddw_w_h(even, even);
    __m256i prod1 = __lasx_xvhaddw_w_h(odd, odd);
    return v_int32x8(__lasx_xvadd_w(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_add(v_dotprod_expand(a, b), c); }

// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
    __m256i odd  = __lasx_xvmulwod_w_hu(a.val, b.val);
    __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
    __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
    return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }

inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
    __m256i even = __lasx_xvmulwev_w_h(a.val, b.val);
    __m256i odd  = __lasx_xvmulwod_w_h(a.val, b.val);
    __m256i prod0 = __lasx_xvhaddw_d_w(even, even);
    __m256i prod1 = __lasx_xvhaddw_d_w(odd, odd);
    return v_int64x4(__lasx_xvadd_d(prod0, prod1));
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }

// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_add(v_dotprod_expand(a, b), c); }

// 16 >> 32 (fast)
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
{ return v_dotprod(a, b); }
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_dotprod(a, b, c); }

// 32 >> 64 (fast)
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
{ return v_dotprod(a, b); }
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b, c); }

// 8 >> 32 (fast)
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_dotprod_expand(a, b, c); }

// 16 >> 64 (fast)
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
    __m256i odd  = __lasx_xvmulwod_w_hu(a.val, b.val);
    __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
    __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
    return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }

inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
    __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val));
    __m256i sign = __lasx_xvsrai_w(prod, 31);
    __m256i lo = __lasx_xvilvl_w(sign, prod);
    __m256i hi = __lasx_xvilvh_w(sign, prod);
    return v_int64x4(__lasx_xvadd_d(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_add(v_dotprod_expand_fast(a, b), c); }

// 32 >> 64f (fast)
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_dotprod_expand(a, b, c); }
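
// Illustrative usage sketch (hypothetical helper, not part of the original
// header): the typical inner loop of a fixed-point convolution, widening
// 16-bit products into 32-bit and 64-bit accumulators with the dot-product
// wrappers defined above.
inline void lasx_example_dotprod(const v_int16x16& a, const v_int16x16& b,
                                 v_int32x8& acc32, v_int64x4& acc64)
{
    acc32 = v_dotprod(a, b, acc32);         // acc32 += a[2i]*b[2i] + a[2i+1]*b[2i+1]
    acc64 = v_dotprod_expand(a, b, acc64);  // acc64 += sums of four adjacent products
}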
#define OPENCV_HAL_LASX_SPLAT2_PS(a, im) \
    v_float32x8(__lasx_xvpermi_w(a.val, a.val, im))

inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
                            const v_float32x8& m1, const v_float32x8& m2,
                            const v_float32x8& m3)
{
    v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);
    v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
    v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
    v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3))));
}

inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
                               const v_float32x8& m1, const v_float32x8& m2,
                               const v_float32x8& a)
{
    v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);
    v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
    v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
}
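
// Illustrative sketch (hypothetical helper, not part of the original header):
// v_matmuladd treats m0..m2 as the first three matrix columns replicated in
// both 128-bit halves and `a` as an additive column, so it can apply an affine
// transform to the two 4-component vectors packed in `v`.
inline v_float32x8 lasx_example_affine(const v_float32x8& v,
                                       const v_float32x8& c0, const v_float32x8& c1,
                                       const v_float32x8& c2, const v_float32x8& bias)
{
    return v_matmuladd(v, c0, c1, c2, bias); // v.x*c0 + v.y*c1 + v.z*c2 + bias, per half
}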
#define OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(_Tpvec, cast_from, cast_to)          \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,             \
                               const _Tpvec& a2, const _Tpvec& a3,             \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
    {                                                                          \
        __m256i t0 = cast_from(__lasx_xvilvl_w(a1.val, a0.val));               \
        __m256i t1 = cast_from(__lasx_xvilvl_w(a3.val, a2.val));               \
        __m256i t2 = cast_from(__lasx_xvilvh_w(a1.val, a0.val));               \
        __m256i t3 = cast_from(__lasx_xvilvh_w(a3.val, a2.val));               \
        b0.val = cast_to(__lasx_xvilvl_d(t1, t0));                             \
        b1.val = cast_to(__lasx_xvilvh_d(t1, t0));                             \
        b2.val = cast_to(__lasx_xvilvl_d(t3, t2));                             \
        b3.val = cast_to(__lasx_xvilvh_d(t3, t2));                             \
    }

OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_uint32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_int32x8,  OPENCV_HAL_NOP, OPENCV_HAL_NOP)

inline void v_transpose4x4(const v_float32x8 &a0, const v_float32x8 &a1,
                           const v_float32x8 &a2, const v_float32x8 &a3,
                           v_float32x8 &b0, v_float32x8 &b1, v_float32x8 &b2, v_float32x8 &b3)
{
    __m256i t0 = __lasx_xvilvl_w(__m256i(a1.val), __m256i(a0.val));
    __m256i t1 = __lasx_xvilvl_w(__m256i(a3.val), __m256i(a2.val));
    __m256i t2 = __lasx_xvilvh_w(__m256i(a1.val), __m256i(a0.val));
    __m256i t3 = __lasx_xvilvh_w(__m256i(a3.val), __m256i(a2.val));
    b0.val = __m256(__lasx_xvilvl_d(t1, t0));
    b1.val = __m256(__lasx_xvilvh_d(t1, t0));
    b2.val = __m256(__lasx_xvilvl_d(t3, t2));
    b3.val = __m256(__lasx_xvilvh_d(t3, t2));
}
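
// Illustrative sketch (hypothetical helper, not part of the original header):
// v_transpose4x4 transposes a 4x4 float block in each 128-bit half
// independently, since the interleave intrinsics above operate per half.
inline void lasx_example_transpose(const v_float32x8& r0, const v_float32x8& r1,
                                   const v_float32x8& r2, const v_float32x8& r3,
                                   v_float32x8& c0, v_float32x8& c1,
                                   v_float32x8& c2, v_float32x8& c3)
{
    v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);
}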
#define OPENCV_HAL_IMPL_LASX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)     \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)   \
    {                                                                 \
        b0.val = intrin(a.val);                                       \
        b1.val = intrin(__lasx_xvpermi_q(a.val, a.val, 0x11));        \
    }                                                                 \
    inline _Tpwvec v_expand_low(const _Tpvec& a)                      \
    { return _Tpwvec(intrin(a.val)); }                                \
    inline _Tpwvec v_expand_high(const _Tpvec& a)                     \
    { return _Tpwvec(intrin(__lasx_xvpermi_q(a.val, a.val, 0x11))); } \
    inline _Tpwvec v256_load_expand(const _Tp* ptr)                   \
    {                                                                 \
        __m128i a = __lsx_vld(ptr, 0);                                \
        return _Tpwvec(intrin(*((__m256i*)&a)));                      \
    }

OPENCV_HAL_IMPL_LASX_EXPAND(v_uint8x32,  v_uint16x16, uchar,    __lasx_vext2xv_hu_bu)
OPENCV_HAL_IMPL_LASX_EXPAND(v_int8x32,   v_int16x16,  schar,    __lasx_vext2xv_h_b)
OPENCV_HAL_IMPL_LASX_EXPAND(v_uint16x16, v_uint32x8,  ushort,   __lasx_vext2xv_wu_hu)
OPENCV_HAL_IMPL_LASX_EXPAND(v_int16x16,  v_int32x8,   short,    __lasx_vext2xv_w_h)
OPENCV_HAL_IMPL_LASX_EXPAND(v_uint32x8,  v_uint64x4,  unsigned, __lasx_vext2xv_du_wu)
OPENCV_HAL_IMPL_LASX_EXPAND(v_int32x8,   v_int64x4,   int,      __lasx_vext2xv_d_w)

#define OPENCV_HAL_IMPL_LASX_EXPAND_Q(_Tpvec, _Tp, intrin)   \
    inline _Tpvec v256_load_expand_q(const _Tp* ptr)         \
    {                                                        \
        __m128i a = __lsx_vld(ptr, 0);                       \
        return _Tpvec(intrin(*((__m256i*)&a)));              \
    }

OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_uint32x8, uchar, __lasx_vext2xv_wu_bu)
OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_int32x8,  schar, __lasx_vext2xv_w_b)
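
// Illustrative sketch (hypothetical helper, not part of the original header):
// the OPENCV_HAL_IMPL_LASX_EXPAND instantiations above provide widening
// conversions, e.g. splitting 32 u8 lanes into two u16 vectors.
inline void lasx_example_expand_u8(const v_uint8x32& a, v_uint16x16& lo, v_uint16x16& hi)
{
    lo = v_expand_low(a);   // widen bytes 0..15
    hi = v_expand_high(a);  // widen bytes 16..31
}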
inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
{ return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); }

inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
{ return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); }

inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
{
    return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a.val, b.val)));
}

inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }

template<int n> inline
v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i res = __lasx_xvssrlrni_bu_h(b.val, a.val, n);
    return v_uint8x32(_v256_shuffle_odd_64(res));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
{
    __m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
    __lasx_xvstelm_d(res, ptr, 0, 0);
    __lasx_xvstelm_d(res, ptr, 8, 2);
}

template<int n> inline
v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
{
    __m256i res = __lasx_xvssrarni_bu_h(b.val, a.val, n);
    return v_uint8x32(_v256_shuffle_odd_64(res));
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
{
    __m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n);
    __lasx_xvstelm_d(res, ptr, 0, 0);
    __lasx_xvstelm_d(res, ptr, 8, 2);
}

template<int n> inline
v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
{
    __m256i res = __lasx_xvssrarni_b_h(b.val, a.val, n);
    return v_int8x32(_v256_shuffle_odd_64(res));
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
{
    __m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n);
    __lasx_xvstelm_d(res, ptr, 0, 0);
    __lasx_xvstelm_d(res, ptr, 8, 2);
}
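
// Illustrative sketch (hypothetical helper, not part of the original header):
// v_rshr_pack performs a rounding right shift by a compile-time amount and
// saturates to the narrower type, the usual final step of fixed-point filtering.
inline v_uint8x32 lasx_example_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
{
    return v_rshr_pack<4>(a, b); // per lane: saturate_cast<uchar>((x + 8) >> 4)
}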
inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
{ return v_int16x16(_v256_shuffle_odd_64(_lasx_packs_w(a.val, b.val))); }

inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }

inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(_lasx_packus_w(a.val, b.val))); }

inline void v_pack_store(short* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
{
    __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0);
    __lasx_xvstelm_d(res, ptr, 0, 0);
    __lasx_xvstelm_d(res, ptr, 8, 2);
}

inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
{ v_store_low(ptr, v_pack_u(a, a)); }

template<int n> inline
v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); }

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
{
    __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
    __lasx_xvstelm_d(res, ptr, 0, 0);
    __lasx_xvstelm_d(res, ptr, 8, 2);
}

template<int n> inline
v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); }

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
{
    __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
    __lasx_xvstelm_d(res, ptr, 0, 0);
    __lasx_xvstelm_d(res, ptr, 8, 2);
}

template<int n> inline
v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
{ return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); }

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x8& a)
{
    __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
    __lasx_xvstelm_d(res, ptr, 0, 0);
    __lasx_xvstelm_d(res, ptr, 8, 2);
}

inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
{
    __m256i ab = __lasx_xvpickev_w(b.val, a.val);
    return v_uint32x8(_v256_shuffle_odd_64(ab));
}

inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
{
    __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
    v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
}

inline void v_pack_store(int* ptr, const v_int64x4& b)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }

template<int n> inline
v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
{ return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); }

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
{
    __m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n);
    __lasx_xvstelm_d(res, ptr, 0, 0);
    __lasx_xvstelm_d(res, ptr, 8, 2);
}

template<int n> inline
v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
{ return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); }

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x4& a)
{
    __m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n);
    __lasx_xvstelm_d(res, ptr, 0, 0);
    __lasx_xvstelm_d(res, ptr, 8, 2);
}
inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i ab = _lasx_packs_h(a.val, b.val);
    return v_uint8x32(_v256_shuffle_odd_64(ab));
}

inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
                           const v_uint32x8& c, const v_uint32x8& d)
{
    __m256i ab = _lasx_packs_w(a.val, b.val);
    __m256i cd = _lasx_packs_w(c.val, d.val);

    __m256i abcd = _v256_shuffle_odd_64(_lasx_packs_h(ab, cd));
    return v_uint8x32(__lasx_xvshuf4i_w(abcd, 0xd8));
}

inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
                           const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
                           const v_uint64x4& g, const v_uint64x4& h)
{
    __m256i ab = _lasx_packs_w(a.val, b.val);
    __m256i cd = _lasx_packs_w(c.val, d.val);
    __m256i ef = _lasx_packs_w(e.val, f.val);
    __m256i gh = _lasx_packs_w(g.val, h.val);

    __m256i abcd = _lasx_packs_w(ab, cd);
    __m256i efgh = _lasx_packs_w(ef, gh);
    __m256i pkall = _v256_shuffle_odd_64(_lasx_packs_h(abcd, efgh));

    __m256i rev = _v256_alignr_b(pkall, pkall, 8);
    return v_uint8x32(__lasx_xvilvl_h(rev, pkall));
}
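
// Illustrative sketch (hypothetical helper, not part of the original header):
// v_pack_b narrows comparison masks (all-ones / all-zeros lanes) from wider
// element types down to bytes, e.g. two 16-bit mask vectors into one byte mask.
inline v_uint8x32 lasx_example_pack_mask(const v_uint16x16& m0, const v_uint16x16& m1)
{
    return v_pack_b(m0, m1);
}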
#define OPENCV_HAL_IMPL_LASX_EXTRACT(_Tpvec)                    \
    template<int s>                                             \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)   \
    { return v_rotate_right<s>(a, b); }

OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint8x32)
OPENCV_HAL_IMPL_LASX_EXTRACT(v_int8x32)
OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint16x16)
OPENCV_HAL_IMPL_LASX_EXTRACT(v_int16x16)
OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint32x8)
OPENCV_HAL_IMPL_LASX_EXTRACT(v_int32x8)
OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint64x4)
OPENCV_HAL_IMPL_LASX_EXTRACT(v_int64x4)
OPENCV_HAL_IMPL_LASX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_LASX_EXTRACT(v_float64x4)

template<int i> inline uchar v_extract_n(const v_uint8x32& a)
{ return (uchar)_v256_extract_b<i>(a.val); }

template<int i> inline schar v_extract_n(const v_int8x32& a)
{ return (schar)v_extract_n<i>(v_reinterpret_as_u8(a)); }

template<int i> inline ushort v_extract_n(const v_uint16x16& a)
{ return (ushort)_v256_extract_h<i>(a.val); }

template<int i> inline short v_extract_n(const v_int16x16& a)
{ return (short)v_extract_n<i>(v_reinterpret_as_u16(a)); }

template<int i> inline uint v_extract_n(const v_uint32x8& a)
{ return (uint)_v256_extract_w<i>(a.val); }

template<int i> inline int v_extract_n(const v_int32x8& a)
{ return (int)v_extract_n<i>(v_reinterpret_as_u32(a)); }

template<int i> inline uint64 v_extract_n(const v_uint64x4& a)
{ return (uint64)_v256_extract_d<i>(a.val); }

template<int i> inline int64 v_extract_n(const v_int64x4& v)
{ return (int64)v_extract_n<i>(v_reinterpret_as_u64(v)); }

template<int i> inline float v_extract_n(const v_float32x8& v)
{
    union { uint iv; float fv; } d;
    d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
    return d.fv;
}

template<int i> inline double v_extract_n(const v_float64x4& v)
{
    union { uint64 iv; double dv; } d;
    d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
    return d.dv;
}

template<int i> inline v_uint32x8 v_broadcast_element(const v_uint32x8& a)
{
    static const __m256i perm = __lasx_xvreplgr2vr_w((char)i);
    return v_uint32x8(__lasx_xvperm_w(a.val, perm));
}

template<int i> inline v_int32x8 v_broadcast_element(const v_int32x8& a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }

template<int i> inline v_float32x8 v_broadcast_element(const v_float32x8& a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
{
    __m256i t0 = __lasx_xvld(ptr, 0);
    __m256i t1 = __lasx_xvld(ptr, 32);

    __m256i p0 = __lasx_xvpickev_b(t1, t0);
    __m256i p1 = __lasx_xvpickod_b(t1, t0);

    a.val = __lasx_xvpermi_d(p0, 0xd8);
    b.val = __lasx_xvpermi_d(p1, 0xd8);
}

inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
{
    __m256i t0 = __lasx_xvld(ptr, 0);
    __m256i t1 = __lasx_xvld(ptr, 32);

    __m256i p0 = __lasx_xvpickev_h(t1, t0);
    __m256i p1 = __lasx_xvpickod_h(t1, t0);

    a.val = __lasx_xvpermi_d(p0, 0xd8);
    b.val = __lasx_xvpermi_d(p1, 0xd8);
}

inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
{
    __m256i t0 = __lasx_xvld(ptr, 0);
    __m256i t1 = __lasx_xvld(ptr, 32);

    __m256i p0 = __lasx_xvpickev_w(t1, t0);
    __m256i p1 = __lasx_xvpickod_w(t1, t0);

    a.val = __lasx_xvpermi_d(p0, 0xd8);
    b.val = __lasx_xvpermi_d(p1, 0xd8);
}

inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
{
    __m256i ab0 = __lasx_xvld(ptr, 0);
    __m256i ab1 = __lasx_xvld(ptr, 32);

    __m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02);
    __m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13);
    __m256i a0 = __lasx_xvilvl_d(ph, pl);
    __m256i b0 = __lasx_xvilvh_d(ph, pl);
    a = v_uint64x4(a0);
    b = v_uint64x4(b0);
}

inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
{
    __m256i bgr0 = __lasx_xvld(ptr, 0);
    __m256i bgr1 = __lasx_xvld(ptr, 32);
    __m256i bgr2 = __lasx_xvld(ptr, 64);

    __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
    __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);

    const __m256i m0 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                    0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    const __m256i m1 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                    -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);

    __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
    __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);

    const __m256i
    sh_b = _v256_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
                        0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
    sh_g = _v256_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
                        1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
    sh_r = _v256_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
                        2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
    b0 = __lasx_xvshuf_b(b0, b0, sh_b);
    g0 = __lasx_xvshuf_b(g0, g0, sh_g);
    r0 = __lasx_xvshuf_b(r0, r0, sh_r);

    a = v_uint8x32(b0);
    b = v_uint8x32(g0);
    c = v_uint8x32(r0);
}

inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
{
    __m256i bgr0 = __lasx_xvld(ptr, 0);
    __m256i bgr1 = __lasx_xvld(ptr, 32);
    __m256i bgr2 = __lasx_xvld(ptr, 64);

    __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
    __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);

    const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
                                    0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
    const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
                                    -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
    __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);
    __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
    const __m256i sh_b = _v256_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
                                      0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m256i sh_g = _v256_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
                                      2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
    const __m256i sh_r = _v256_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
                                      4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    b0 = __lasx_xvshuf_b(b0, b0, sh_b);
    g0 = __lasx_xvshuf_b(g0, g0, sh_g);
    r0 = __lasx_xvshuf_b(r0, r0, sh_r);

    a = v_uint16x16(b0);
    b = v_uint16x16(g0);
    c = v_uint16x16(r0);
}

inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
{
    __m256i bgr0 = __lasx_xvld(ptr, 0);
    __m256i bgr1 = __lasx_xvld(ptr, 32);
    __m256i bgr2 = __lasx_xvld(ptr, 64);

    __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
    __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);

    __m256i m24 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);
    __m256i m92 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0);
    __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m24), bgr1, m92);
    __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m92), bgr1, m24);
    __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m24), s02_high, m92);

    b0 = __lasx_xvshuf4i_w(b0, 0x6c);
    g0 = __lasx_xvshuf4i_w(g0, 0xb1);
    r0 = __lasx_xvshuf4i_w(r0, 0xc6);

    a = v_uint32x8(b0);
    b = v_uint32x8(g0);
    c = v_uint32x8(r0);
}

inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
{
    __m256i bgr0 = __lasx_xvld(ptr, 0);
    __m256i bgr1 = __lasx_xvld(ptr, 32);
    __m256i bgr2 = __lasx_xvld(ptr, 64);

    __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12);
    __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12);
    __m256i s20r = __lasx_xvpermi_d(__lasx_xvpermi_q(bgr2, bgr0, 0x12), 0x1b);
    __m256i b0 = __lasx_xvilvl_d(s20r, s01);
    __m256i g0 = _v256_alignr_b(s12, s01, 8);
    __m256i r0 = __lasx_xvilvh_d(s12, s20r);

    a = v_uint64x4(b0);
    b = v_uint64x4(g0);
    c = v_uint64x4(r0);
}

inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
{
    __m256i t0 = __lasx_xvld(ptr, 0);
    __m256i t1 = __lasx_xvld(ptr, 32);
    __m256i t2 = __lasx_xvld(ptr, 64);
    __m256i t3 = __lasx_xvld(ptr, 96);

    const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
    __m256i ac_lo = __lasx_xvpickev_b(t1, t0);
    __m256i bd_lo = __lasx_xvpickod_b(t1, t0);
    __m256i ac_hi = __lasx_xvpickev_b(t3, t2);
    __m256i bd_hi = __lasx_xvpickod_b(t3, t2);

    __m256i a_pre = __lasx_xvpickev_b(ac_hi, ac_lo);
    __m256i c_pre = __lasx_xvpickod_b(ac_hi, ac_lo);
    __m256i b_pre = __lasx_xvpickev_b(bd_hi, bd_lo);
    __m256i d_pre = __lasx_xvpickod_b(bd_hi, bd_lo);

    a.val = __lasx_xvperm_w(a_pre, sh);
    b.val = __lasx_xvperm_w(b_pre, sh);
    c.val = __lasx_xvperm_w(c_pre, sh);
    d.val = __lasx_xvperm_w(d_pre, sh);
}

inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
{
    __m256i t0 = __lasx_xvld(ptr, 0);
    __m256i t1 = __lasx_xvld(ptr, 32);
    __m256i t2 = __lasx_xvld(ptr, 64);
    __m256i t3 = __lasx_xvld(ptr, 96);

    const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
    __m256i ac_lo = __lasx_xvpickev_h(t1, t0);
    __m256i bd_lo = __lasx_xvpickod_h(t1, t0);
    __m256i ac_hi = __lasx_xvpickev_h(t3, t2);
    __m256i bd_hi = __lasx_xvpickod_h(t3, t2);

    __m256i a_pre = __lasx_xvpickev_h(ac_hi, ac_lo);
    __m256i c_pre = __lasx_xvpickod_h(ac_hi, ac_lo);
    __m256i b_pre = __lasx_xvpickev_h(bd_hi, bd_lo);
    __m256i d_pre = __lasx_xvpickod_h(bd_hi, bd_lo);

    a.val = __lasx_xvperm_w(a_pre, sh);
    b.val = __lasx_xvperm_w(b_pre, sh);
    c.val = __lasx_xvperm_w(c_pre, sh);
    d.val = __lasx_xvperm_w(d_pre, sh);
}

inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
{
    __m256i p0 = __lasx_xvld(ptr, 0);
    __m256i p1 = __lasx_xvld(ptr, 32);
    __m256i p2 = __lasx_xvld(ptr, 64);
    __m256i p3 = __lasx_xvld(ptr, 96);

    __m256i p01l = __lasx_xvilvl_w(p1, p0);
    __m256i p01h = __lasx_xvilvh_w(p1, p0);
    __m256i p23l = __lasx_xvilvl_w(p3, p2);
    __m256i p23h = __lasx_xvilvh_w(p3, p2);

    __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
    __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
    __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
    __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);

    __m256i b0 = __lasx_xvilvl_w(plh, pll);
    __m256i g0 = __lasx_xvilvh_w(plh, pll);
    __m256i r0 = __lasx_xvilvl_w(phh, phl);
    __m256i a0 = __lasx_xvilvh_w(phh, phl);

    a = v_uint32x8(b0);
    b = v_uint32x8(g0);
    c = v_uint32x8(r0);
    d = v_uint32x8(a0);
}

inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
{
    __m256i bgra0 = __lasx_xvld(ptr, 0);
    __m256i bgra1 = __lasx_xvld(ptr, 32);
    __m256i bgra2 = __lasx_xvld(ptr, 64);
    __m256i bgra3 = __lasx_xvld(ptr, 96);

    __m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02);
    __m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13);
    __m256i l13 = __lasx_xvpermi_q(bgra1, bgra3, 0x02);
    __m256i h13 = __lasx_xvpermi_q(bgra1, bgra3, 0x13);

    __m256i b0 = __lasx_xvilvl_d(l13, l02);
    __m256i g0 = __lasx_xvilvh_d(l13, l02);
    __m256i r0 = __lasx_xvilvl_d(h13, h02);
    __m256i a0 = __lasx_xvilvh_d(h13, h02);

    a = v_uint64x4(b0);
    b = v_uint64x4(g0);
    c = v_uint64x4(r0);
    d = v_uint64x4(a0);
}
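
// Illustrative sketch (hypothetical helper, not part of the original header):
// load one packed BGR row segment (96 consecutive bytes) into three planar
// registers with v_load_deinterleave.
inline void lasx_example_load_bgr(const uchar* ptr,
                                  v_uint8x32& b, v_uint8x32& g, v_uint8x32& r)
{
    v_load_deinterleave(ptr, b, g, r); // 32 de-interleaved samples per channel
}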
inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i xy_l = __lasx_xvilvl_b(y.val, x.val);
    __m256i xy_h = __lasx_xvilvh_b(y.val, x.val);

    __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
    __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);

    __lasx_xvst(xy0, (__m256i*)ptr, 0);
    __lasx_xvst(xy1, (__m256i*)ptr, 32*1);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i xy_l = __lasx_xvilvl_h(y.val, x.val);
    __m256i xy_h = __lasx_xvilvh_h(y.val, x.val);

    __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
    __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);

    __lasx_xvst(xy0, (__m256i*)ptr, 0);
    __lasx_xvst(xy1, (__m256i*)ptr, 16*2);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i xy_l = __lasx_xvilvl_w(y.val, x.val);
    __m256i xy_h = __lasx_xvilvh_w(y.val, x.val);

    __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
    __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);

    __lasx_xvst(xy0, (__m256i*)ptr, 0);
    __lasx_xvst(xy1, (__m256i*)ptr, 8*4);
}

inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i xy_l = __lasx_xvilvl_d(y.val, x.val);
    __m256i xy_h = __lasx_xvilvh_d(y.val, x.val);

    __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
    __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);

    __lasx_xvst(xy0, (__m256i*)ptr, 0);
    __lasx_xvst(xy1, (__m256i*)ptr, 4*8);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    const __m256i sh_b = _v256_setr_b(
            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
    const __m256i sh_g = _v256_setr_b(
            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
    const __m256i sh_r = _v256_setr_b(
            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);

    __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b);
    __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
    __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);

    const __m256i m0 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                    0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    const __m256i m1 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                    0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);

    __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1);
    __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
    __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);

    __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16);
    __m256i bgr1 = __lasx_xvpermi_q(p0, p2, 0 + 3*16);
    __m256i bgr2 = __lasx_xvpermi_q(p2, p1, 1 + 3*16);

    __lasx_xvst(bgr0, (__m256i*)ptr, 0);
    __lasx_xvst(bgr1, (__m256i*)ptr, 32);
    __lasx_xvst(bgr2, (__m256i*)ptr, 64);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    const __m256i sh_b = _v256_setr_b(
            0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
            0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m256i sh_g = _v256_setr_b(
            10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
            10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
    const __m256i sh_r = _v256_setr_b(
            4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
            4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);

    __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b);
    __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
    __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);

    const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
                                    0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
    const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
                                    -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);

    __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1);
    __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
    __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);

    __m256i bgr0 = __lasx_xvpermi_q(p2, p0, 0 + 2*16);
    __m256i bgr2 = __lasx_xvpermi_q(p2, p0, 1 + 3*16);

    __lasx_xvst(bgr0, (__m256i*)ptr, 0);
    __lasx_xvst(p1, (__m256i*)ptr, 16*2);
    __lasx_xvst(bgr2, (__m256i*)ptr, 32*2);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i b0 = __lasx_xvshuf4i_w(a.val, 0x6c);
    __m256i g0 = __lasx_xvshuf4i_w(b.val, 0xb1);
    __m256i r0 = __lasx_xvshuf4i_w(c.val, 0xc6);

    __m256i bitmask_1 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0);
    __m256i bitmask_2 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);

    __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, bitmask_1), r0, bitmask_2);
    __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, bitmask_1), b0, bitmask_2);
    __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, bitmask_1), g0, bitmask_2);

    __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16);
    __m256i bgr2 = __lasx_xvpermi_q(p1, p0, 1 + 3*16);

    __lasx_xvst(bgr0, (__m256i*)ptr, 0);
    __lasx_xvst(p2, (__m256i*)ptr, 8*4);
    __lasx_xvst(bgr2, (__m256i*)ptr, 16*4);
}

inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i s01 = __lasx_xvilvl_d(b.val, a.val);
    __m256i s12 = __lasx_xvilvh_d(c.val, b.val);
    __m256i s20 = __lasx_xvpermi_w(a.val, c.val, 0xe4);

    __m256i bgr0 = __lasx_xvpermi_q(s20, s01, 0 + 2*16);
    __m256i bgr1 = __lasx_xvpermi_q(s01, s12, 0x30);
    __m256i bgr2 = __lasx_xvpermi_q(s12, s20, 1 + 3*16);

    __lasx_xvst(bgr0, (__m256i*)ptr, 0);
    __lasx_xvst(bgr1, (__m256i*)ptr, 4*8);
    __lasx_xvst(bgr2, (__m256i*)ptr, 8*8);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
                                const v_uint8x32& c, const v_uint8x32& d,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i bg0 = __lasx_xvilvl_b(b.val, a.val);
    __m256i bg1 = __lasx_xvilvh_b(b.val, a.val);
    __m256i ra0 = __lasx_xvilvl_b(d.val, c.val);
    __m256i ra1 = __lasx_xvilvh_b(d.val, c.val);

    __m256i bgra0_ = __lasx_xvilvl_h(ra0, bg0);
    __m256i bgra1_ = __lasx_xvilvh_h(ra0, bg0);
    __m256i bgra2_ = __lasx_xvilvl_h(ra1, bg1);
    __m256i bgra3_ = __lasx_xvilvh_h(ra1, bg1);

    __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
    __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
    __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
    __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);

    __lasx_xvst(bgra0, (__m256i*)ptr, 0);
    __lasx_xvst(bgra1, (__m256i*)ptr, 32);
    __lasx_xvst(bgra2, (__m256i*)ptr, 64);
    __lasx_xvst(bgra3, (__m256i*)ptr, 96);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
                                const v_uint16x16& c, const v_uint16x16& d,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i bg0 = __lasx_xvilvl_h(b.val, a.val);
    __m256i bg1 = __lasx_xvilvh_h(b.val, a.val);
    __m256i ra0 = __lasx_xvilvl_h(d.val, c.val);
    __m256i ra1 = __lasx_xvilvh_h(d.val, c.val);

    __m256i bgra0_ = __lasx_xvilvl_w(ra0, bg0);
    __m256i bgra1_ = __lasx_xvilvh_w(ra0, bg0);
    __m256i bgra2_ = __lasx_xvilvl_w(ra1, bg1);
    __m256i bgra3_ = __lasx_xvilvh_w(ra1, bg1);

    __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
    __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
    __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
    __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);

    __lasx_xvst(bgra0, (__m256i*)ptr, 0);
    __lasx_xvst(bgra1, (__m256i*)ptr, 16*2);
    __lasx_xvst(bgra2, (__m256i*)ptr, 32*2);
    __lasx_xvst(bgra3, (__m256i*)ptr, 48*2);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
                                const v_uint32x8& c, const v_uint32x8& d,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i bg0 = __lasx_xvilvl_w(b.val, a.val);
    __m256i bg1 = __lasx_xvilvh_w(b.val, a.val);
    __m256i ra0 = __lasx_xvilvl_w(d.val, c.val);
    __m256i ra1 = __lasx_xvilvh_w(d.val, c.val);

    __m256i bgra0_ = __lasx_xvilvl_d(ra0, bg0);
    __m256i bgra1_ = __lasx_xvilvh_d(ra0, bg0);
    __m256i bgra2_ = __lasx_xvilvl_d(ra1, bg1);
    __m256i bgra3_ = __lasx_xvilvh_d(ra1, bg1);

    __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
    __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
    __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
    __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);

    __lasx_xvst(bgra0, (__m256i*)ptr, 0);
    __lasx_xvst(bgra1, (__m256i*)ptr, 8*4);
    __lasx_xvst(bgra2, (__m256i*)ptr, 16*4);
    __lasx_xvst(bgra3, (__m256i*)ptr, 24*4);
}

inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
                                const v_uint64x4& c, const v_uint64x4& d,
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
{
    __m256i bg0 = __lasx_xvilvl_d(b.val, a.val);
    __m256i bg1 = __lasx_xvilvh_d(b.val, a.val);
    __m256i ra0 = __lasx_xvilvl_d(d.val, c.val);
    __m256i ra1 = __lasx_xvilvh_d(d.val, c.val);

    __m256i bgra0 = __lasx_xvpermi_q(ra0, bg0, 0 + 2*16);
    __m256i bgra1 = __lasx_xvpermi_q(ra1, bg1, 0 + 2*16);
    __m256i bgra2 = __lasx_xvpermi_q(ra0, bg0, 1 + 3*16);
    __m256i bgra3 = __lasx_xvpermi_q(ra1, bg1, 1 + 3*16);

    __lasx_xvst(bgra0, (__m256i*)ptr, 0);
    __lasx_xvst(bgra1, (__m256i*)(ptr), 4*8);
    __lasx_xvst(bgra2, (__m256i*)(ptr), 8*8);
    __lasx_xvst(bgra3, (__m256i*)(ptr), 12*8);
}
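
// Illustrative sketch (hypothetical helper, not part of the original header),
// mirroring the loader example above: write four planar channels back as
// packed BGRA (128 bytes).
inline void lasx_example_store_bgra(uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
                                    const v_uint8x32& r, const v_uint8x32& a)
{
    v_store_interleave(ptr, b, g, r, a);
}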
#define OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
    inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 )                  \
    {                                                                                             \
        _Tpvec1 a1, b1;                                                                           \
        v_load_deinterleave((const _Tp1*)ptr, a1, b1);                                            \
        a0 = v_reinterpret_as_##suffix0(a1);                                                      \
        b0 = v_reinterpret_as_##suffix0(b1);                                                      \
    }                                                                                             \
    inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 )     \
    {                                                                                             \
        _Tpvec1 a1, b1, c1;                                                                       \
        v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1);                                        \
        a0 = v_reinterpret_as_##suffix0(a1);                                                      \
        b0 = v_reinterpret_as_##suffix0(b1);                                                      \
        c0 = v_reinterpret_as_##suffix0(c1);                                                      \
    }                                                                                             \
    inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
    {                                                                                             \
        _Tpvec1 a1, b1, c1, d1;                                                                   \
        v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1);                                    \
        a0 = v_reinterpret_as_##suffix0(a1);                                                      \
        b0 = v_reinterpret_as_##suffix0(b1);                                                      \
        c0 = v_reinterpret_as_##suffix0(c1);                                                      \
        d0 = v_reinterpret_as_##suffix0(d1);                                                      \
    }                                                                                             \
    inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,              \
                                    hal::StoreMode =hal::STORE_UNALIGNED )                        \
    {                                                                                             \
        _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                              \
        _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                              \
        v_store_interleave((_Tp1*)ptr, a1, b1);                                                   \
    }                                                                                             \
    inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                                    hal::StoreMode =hal::STORE_UNALIGNED )                        \
    {                                                                                             \
        _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                              \
        _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                              \
        _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                              \
        v_store_interleave((_Tp1*)ptr, a1, b1, c1);                                               \
    }                                                                                             \
    inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,              \
                                    const _Tpvec0& c0, const _Tpvec0& d0,                         \
                                    hal::StoreMode =hal::STORE_UNALIGNED )                        \
    {                                                                                             \
        _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                              \
        _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                              \
        _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                              \
        _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0);                                              \
        v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1);                                           \
    }

OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int8x32,   schar,  s8,  v_uint8x32,  uchar,    u8)
OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int16x16,  short,  s16, v_uint16x16, ushort,   u16)
OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int32x8,   int,    s32, v_uint32x8,  unsigned, u32)
OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float32x8, float,  f32, v_uint32x8,  unsigned, u32)
OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int64x4,   int64,  s64, v_uint64x4,  uint64,   u64)
OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4,  uint64,   u64)
inline v_float32x8 v256_load_expand(const hfloat* ptr)
{
#if CV_FP16
    return v_float32x8(__lasx_xvfcvtl_s_h(__lasx_xvpermi_d(__lsx_vld((const __m128i*)ptr, 0), 0x10)));
#else
    float CV_DECL_ALIGNED(32) buf[8];
    for (int i = 0; i < 8; i++)
        buf[i] = (float)ptr[i];
    return v256_load_aligned(buf);
#endif
}

inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
{
#if CV_FP16
    __m256i ah = __lasx_xvfcvt_h_s(a.val, a.val);
    __lsx_vst((__m128i)ah, ptr, 0);
#else
    float CV_DECL_ALIGNED(32) buf[8];
    v_store_aligned(buf, a);
    for (int i = 0; i < 8; i++)
        ptr[i] = hfloat(buf[i]);
#endif
}
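
// Illustrative sketch (hypothetical helper, not part of the original header):
// round-trip eight half-precision values through f32 registers using the
// conversions above.
inline void lasx_example_fp16_roundtrip(const hfloat* src, hfloat* dst)
{
    v_float32x8 v = v256_load_expand(src); // 8 halfs -> 8 floats
    v_pack_store(dst, v);                  // 8 floats -> 8 halfs
}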
inline void v256_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

} // cv::

#endif // OPENCV_HAL_INTRIN_LASX_HPP