#ifndef OPENCV_HAL_INTRIN_LASX_HPP
#define OPENCV_HAL_INTRIN_LASX_HPP

#define CV_SIMD256_64F 1
#define CV_SIMD256_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
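// Low-level helpers that build __m256i/__m256/__m256d values from scalar
// lanes ("setr" = element 0 first, "set" = element 0 last) and broadcast a
// single scalar with the "setall" variants.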
inline __m256i _v256_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7,
                            char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15,
                            char v16, char v17, char v18, char v19, char v20, char v21, char v22, char v23,
                            char v24, char v25, char v26, char v27, char v28, char v29, char v30, char v31)
{
    return (__m256i)v32i8{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
                           v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
                           v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
                           v30, v31 };
}
inline __m256i _v256_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7,
                           char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15,
                           char v16, char v17, char v18, char v19, char v20, char v21, char v22, char v23,
                           char v24, char v25, char v26, char v27, char v28, char v29, char v30, char v31)
{
    return (__m256i)v32i8{ v31, v30,
                           v29, v28, v27, v26, v25, v24, v23, v22, v21, v20,
                           v19, v18, v17, v16, v15, v14, v13, v12, v11, v10,
                           v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 };
}
inline __m256i _v256_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
                            short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15)
{
    return (__m256i)v16i16{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 };
}
inline __m256i _v256_setr_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
{
    return (__m256i)v8i32{ v0, v1, v2, v3, v4, v5, v6, v7 };
}

inline __m256i _v256_set_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
{
    return (__m256i)v8i32{ v7, v6, v5, v4, v3, v2, v1, v0 };
}

inline __m256i _v256_setall_w(int v0)
{
    return (__m256i)v8i32{ v0, v0, v0, v0, v0, v0, v0, v0 };
}
inline __m256i _v256_setr_d(int64 v0, int64 v1, int64 v2, int64 v3)
{
    return (__m256i)v4i64{ v0, v1, v2, v3 };
}

inline __m256i _v256_set_d(int64 v0, int64 v1, int64 v2, int64 v3)
{
    return (__m256i)v4i64{ v3, v2, v1, v0 };
}
inline __m256 _v256_setr_ps(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7)
{
    return (__m256)v8f32{ v0, v1, v2, v3, v4, v5, v6, v7 };
}

inline __m256 _v256_setall_ps(float f32)
{
    return (__m256)v8f32{ f32, f32, f32, f32, f32, f32, f32, f32 };
}

inline __m256d _v256_setr_pd(double v0, double v1, double v2, double v3)
{
    return (__m256d)v4f64{ v0, v1, v2, v3 };
}

inline __m256d _v256_setall_pd(double f64)
{
    return (__m256d)v4f64{ f64, f64, f64, f64 };
}
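// Saturating pack helpers: each one narrows the lanes of two vectors into a
// single vector using the xvssrarni family with a rounding shift of 0.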
inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrarni_bu_h(b, a, 0);
}

inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrarni_b_h(b, a, 0);
}

inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrarni_hu_w(b, a, 0);
}

inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrarni_h_w(b, a, 0);
}
inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
{ return __lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02); }

inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
{ return __m256(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }

inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }

inline __m256i _v256_shuffle_odd_64(const __m256i& v)
{ return __lasx_xvpermi_d(v, 0xd8); }

inline __m256d _v256_shuffle_odd_64(const __m256d& v)
{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&v), 0xd8)); }
template<int imm>
inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
{ return __lasx_xvpermi_q(a, b, imm); }

template<int imm>
inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
{ return __m256(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }

template<int imm>
inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }

template<int imm, typename _Tpvec>
inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }

template<int imm>
inline __m256i _v256_permute4x64(const __m256i& a)
{ return __lasx_xvpermi_d(a, imm); }

template<int imm>
inline __m256d _v256_permute4x64(const __m256d& a)
{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&a), imm)); }

template<int imm, typename _Tpvec>
inline _Tpvec v256_permute4x64(const _Tpvec& a)
{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
inline __m128i _v256_extract_high(const __m256i& v)
{
    __m256i temp256i = __lasx_xvpermi_d(v, 0x4E);
    return *((__m128i*)&temp256i);
}

inline __m128 _v256_extract_high(const __m256& v)
{ return __m128(_v256_extract_high(*((__m256i*)&v))); }

inline __m128d _v256_extract_high(const __m256d& v)
{ return __m128d(_v256_extract_high(*((__m256i*)&v))); }

inline __m128i _v256_extract_low(const __m256i& v)
{ return *((__m128i*)&v); }

inline __m128 _v256_extract_low(const __m256& v)
{ return __m128(_v256_extract_low(*((__m256i*)&v))); }

inline __m128d _v256_extract_low(const __m256d& v)
{ return __m128d(_v256_extract_low(*((__m256i*)&v))); }

inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
{
    return __lasx_xvssrlrni_hu_w(b, a, 0);
}
template<int i>
inline int _v256_extract_b(const __m256i& a)
{
    int des[1] = {0};
    __lasx_xvstelm_b(a, des, 0, i);
    return des[0];
}

template<int i>
inline int _v256_extract_h(const __m256i& a)
{
    int des[1] = {0};
    __lasx_xvstelm_h(a, des, 0, i);
    return des[0];
}

template<int i>
inline int _v256_extract_w(const __m256i& a)
{
    return __lasx_xvpickve2gr_w(a, i);
}

template<int i>
inline int64 _v256_extract_d(const __m256i& a)
{
    return __lasx_xvpickve2gr_d(a, i);
}
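// 256-bit universal intrinsic wrapper types. Each struct carries one LASX
// register in `val`, reports its lane type and lane count, and exposes
// get0() for reading lane 0 as a scalar.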
struct v_uint8x32
{
    typedef uchar lane_type;
    enum { nlanes = 32 };
    __m256i val;

    explicit v_uint8x32(__m256i v) : val(v) {}
    v_uint8x32(uchar v0,  uchar v1,  uchar v2,  uchar v3,  uchar v4,  uchar v5,  uchar v6,  uchar v7,
               uchar v8,  uchar v9,  uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15,
               uchar v16, uchar v17, uchar v18, uchar v19, uchar v20, uchar v21, uchar v22, uchar v23,
               uchar v24, uchar v25, uchar v26, uchar v27, uchar v28, uchar v29, uchar v30, uchar v31)
    {
        val = _v256_setr_b((char)v0,  (char)v1,  (char)v2,  (char)v3,
                           (char)v4,  (char)v5,  (char)v6,  (char)v7,  (char)v8,  (char)v9,
                           (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
                           (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
                           (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
                           (char)v28, (char)v29, (char)v30, (char)v31);
    }
    v_uint8x32() {}

    uchar get0() const
    {
        uchar des[32] = {0};
        __lasx_xvstelm_b(val, des, 0, 0);
        return des[0];
    }
};
struct v_int8x32
{
    typedef schar lane_type;
    enum { nlanes = 32 };
    __m256i val;

    explicit v_int8x32(__m256i v) : val(v) {}
    v_int8x32(schar v0,  schar v1,  schar v2,  schar v3,  schar v4,  schar v5,  schar v6,  schar v7,
              schar v8,  schar v9,  schar v10, schar v11, schar v12, schar v13, schar v14, schar v15,
              schar v16, schar v17, schar v18, schar v19, schar v20, schar v21, schar v22, schar v23,
              schar v24, schar v25, schar v26, schar v27, schar v28, schar v29, schar v30, schar v31)
    {
        val = _v256_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
                           v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
                           v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
    }
    v_int8x32() {}

    schar get0() const
    {
        schar des[32] = {0};
        __lasx_xvstelm_b(val, des, 0, 0);
        return des[0];
    }
};
struct v_uint16x16
{
    typedef ushort lane_type;
    enum { nlanes = 16 };
    __m256i val;

    explicit v_uint16x16(__m256i v) : val(v) {}
    v_uint16x16(ushort v0, ushort v1, ushort v2,  ushort v3,  ushort v4,  ushort v5,  ushort v6,  ushort v7,
                ushort v8, ushort v9, ushort v10, ushort v11, ushort v12, ushort v13, ushort v14, ushort v15)
    {
        val = _v256_setr_h((short)v0,  (short)v1,  (short)v2,  (short)v3,
                           (short)v4,  (short)v5,  (short)v6,  (short)v7,  (short)v8,  (short)v9,
                           (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
    }
    v_uint16x16() {}

    ushort get0() const
    {
        ushort des[16] = {0};
        __lasx_xvstelm_h(val, des, 0, 0);
        return des[0];
    }
};
struct v_int16x16
{
    typedef short lane_type;
    enum { nlanes = 16 };
    __m256i val;

    explicit v_int16x16(__m256i v) : val(v) {}
    v_int16x16(short v0, short v1, short v2,  short v3,  short v4,  short v5,  short v6,  short v7,
               short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15)
    {
        val = _v256_setr_h(v0, v1, v2, v3, v4, v5, v6, v7,
                           v8, v9, v10, v11, v12, v13, v14, v15);
    }
    v_int16x16() {}

    short get0() const
    {
        short des[16] = {0};
        __lasx_xvstelm_h(val, des, 0, 0);
        return des[0];
    }
};
struct v_uint32x8
{
    typedef unsigned lane_type;
    enum { nlanes = 8 };
    __m256i val;

    explicit v_uint32x8(__m256i v) : val(v) {}
    v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
               unsigned v4, unsigned v5, unsigned v6, unsigned v7)
    {
        val = _v256_setr_w((unsigned)v0, (unsigned)v1, (unsigned)v2,
                           (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
    }
    v_uint32x8() {}

    unsigned get0() const { return __lasx_xvpickve2gr_wu(val, 0); }
};
struct v_int32x8
{
    typedef int lane_type;
    enum { nlanes = 8 };
    __m256i val;

    explicit v_int32x8(__m256i v) : val(v) {}
    v_int32x8(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
    {
        val = _v256_setr_w(v0, v1, v2, v3, v4, v5, v6, v7);
    }
    v_int32x8() {}

    int get0() const { return __lasx_xvpickve2gr_w(val, 0); }
};
struct v_float32x8
{
    typedef float lane_type;
    enum { nlanes = 8 };
    __m256 val;

    explicit v_float32x8(__m256 v) : val(v) {}
    explicit v_float32x8(__m256i v) { val = *((__m256*)&v); }
    v_float32x8(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7)
    {
        val = _v256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
    }
    v_float32x8() {}

    float get0() const
    {
        float des[8] = {0};
        __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0);
        return des[0];
    }

    int get0toint() const
    {
        int des[8] = {0};
        __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0);
        return des[0];
    }
};
struct v_uint64x4
{
    typedef uint64 lane_type;
    enum { nlanes = 4 };
    __m256i val;

    explicit v_uint64x4(__m256i v) : val(v) {}
    v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
    { val = _v256_setr_d((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
    v_uint64x4() {}

    uint64 get0() const
    {
        return __lasx_xvpickve2gr_du(val, 0);
    }
};
struct v_int64x4
{
    typedef int64 lane_type;
    enum { nlanes = 4 };
    __m256i val;

    explicit v_int64x4(__m256i v) : val(v) {}
    v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
    { val = _v256_setr_d(v0, v1, v2, v3); }
    v_int64x4() {}

    int64 get0() const
    {
        return __lasx_xvpickve2gr_d(val, 0);
    }
};
struct v_float64x4
{
    typedef double lane_type;
    enum { nlanes = 4 };
    __m256d val;

    explicit v_float64x4(__m256d v) : val(v) {}
    explicit v_float64x4(__m256i v) { val = *((__m256d*)&v); }
    v_float64x4(double v0, double v1, double v2, double v3)
    { val = _v256_setr_pd(v0, v1, v2, v3); }
    v_float64x4() {}

    double get0() const
    {
        double des[4] = {0};
        __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0);
        return des[0];
    }

    int64 get0toint64() const
    {
        int64 des[4] = {0};
        __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0);
        return des[0];
    }
};
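// Load and store. The macro below generates the whole load/store API
// (v256_load, v256_load_low, v256_load_halves, v_store, v_store_low/high, ...)
// for each integer vector type; the _FLT variant further below handles the
// float/double types.
//
// Usage sketch (illustrative only, not part of the upstream header):
//
//   float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   v_float32x8 x = v256_load(buf);   // load 8 packed floats
//   v_store(buf, x + x);              // operator+ is defined further below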
#define OPENCV_HAL_IMPL_LASX_LOADSTORE(_Tpvec, _Tp)                       \
    inline _Tpvec v256_load(const _Tp* ptr)                               \
    { return _Tpvec(__lasx_xvld(ptr, 0)); }                               \
    inline _Tpvec v256_load_aligned(const _Tp* ptr)                       \
    { return _Tpvec(__lasx_xvld(ptr, 0)); }                               \
    inline _Tpvec v256_load_low(const _Tp* ptr)                           \
    {                                                                     \
        __m128i v128 = __lsx_vld(ptr, 0);                                 \
        return _Tpvec(*((__m256i*)&v128));                                \
    }                                                                     \
    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
    {                                                                     \
        __m128i vlo = __lsx_vld(ptr0, 0);                                 \
        __m128i vhi = __lsx_vld(ptr1, 0);                                 \
        return _Tpvec(_v256_combine(vlo, vhi));                           \
    }                                                                     \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                        \
    { __lasx_xvst(a.val, ptr, 0); }                                       \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
    { __lasx_xvst(a.val, ptr, 0); }                                       \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)        \
    { __lasx_xvst(a.val, ptr, 0); }                                       \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)   \
    {                                                                     \
        if( mode == hal::STORE_UNALIGNED )                                \
            __lasx_xvst(a.val, ptr, 0);                                   \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )                     \
            __lasx_xvst(a.val, ptr, 0);                                   \
        else                                                              \
            __lasx_xvst(a.val, ptr, 0);                                   \
    }                                                                     \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                    \
    { __lsx_vst(_v256_extract_low(a.val), ptr, 0); }                      \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                   \
    { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }

OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint8x32,  uchar)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int8x32,   schar)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint16x16, ushort)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int16x16,  short)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint32x8,  unsigned)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int32x8,   int)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint64x4,  uint64)
OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int64x4,   int64)
#define OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg)          \
    inline _Tpvec v256_load(const _Tp* ptr)                               \
    { return _Tpvec(__lasx_xvld(ptr, 0)); }                               \
    inline _Tpvec v256_load_aligned(const _Tp* ptr)                       \
    { return _Tpvec(__lasx_xvld(ptr, 0)); }                               \
    inline _Tpvec v256_load_low(const _Tp* ptr)                           \
    {                                                                     \
        __m128i v128 = __lsx_vld(ptr, 0);                                 \
        return _Tpvec(*((__m256i*)&v128));                                \
    }                                                                     \
    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
    {                                                                     \
        halfreg vlo = __lsx_vld(ptr0, 0);                                 \
        halfreg vhi = __lsx_vld(ptr1, 0);                                 \
        return _Tpvec(_v256_combine(vlo, vhi));                           \
    }                                                                     \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                        \
    { __lasx_xvst(a.val, ptr, 0); }                                       \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
    { __lasx_xvst(a.val, ptr, 0); }                                       \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)        \
    { __lasx_xvst(a.val, ptr, 0); }                                       \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)   \
    {                                                                     \
        if( mode == hal::STORE_UNALIGNED )                                \
            __lasx_xvst(a.val, ptr, 0);                                   \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )                     \
            __lasx_xvst(a.val, ptr, 0);                                   \
        else                                                              \
            __lasx_xvst(a.val, ptr, 0);                                   \
    }                                                                     \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                    \
    { __lsx_vst(_v256_extract_low(a.val), ptr, 0); }                      \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                   \
    { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }

OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float32x8, float,  __m128i)
OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float64x4, double, __m128i)
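// v256_setzero_* / v256_setall_* constructors and v_reinterpret_as_* casts
// are generated below for every vector type.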
inline __m256i _lasx_256_castps_si256(const __m256& v)
{ return __m256i(v); }

inline __m256i _lasx_256_castpd_si256(const __m256d& v)
{ return __m256i(v); }

#define OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, _Tpvecf, suffix, cast)   \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)      \
    { return _Tpvec(cast(a.val)); }

#define OPENCV_HAL_IMPL_LASX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)            \
    inline _Tpvec v256_setzero_##suffix()                                           \
    { return _Tpvec(__lasx_xvreplgr2vr_d(0)); }                                     \
    inline _Tpvec v256_setall_##suffix(_Tp v)                                       \
    { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); }                    \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP)          \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP)          \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP)          \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16,  suffix, OPENCV_HAL_NOP)          \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8,  suffix, OPENCV_HAL_NOP)          \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8,   suffix, OPENCV_HAL_NOP)          \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4,  suffix, OPENCV_HAL_NOP)          \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4,   suffix, OPENCV_HAL_NOP)          \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float32x8, suffix, _lasx_256_castps_si256)  \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float64x4, suffix, _lasx_256_castpd_si256)
OPENCV_HAL_IMPL_LASX_INIT(v_uint8x32,  uchar,    u8,  b, int)
OPENCV_HAL_IMPL_LASX_INIT(v_int8x32,   schar,    s8,  b, int)
OPENCV_HAL_IMPL_LASX_INIT(v_uint16x16, ushort,   u16, h, int)
OPENCV_HAL_IMPL_LASX_INIT(v_int16x16,  short,    s16, h, int)
OPENCV_HAL_IMPL_LASX_INIT(v_uint32x8,  unsigned, u32, w, int)
OPENCV_HAL_IMPL_LASX_INIT(v_int32x8,   int,      s32, w, int)
OPENCV_HAL_IMPL_LASX_INIT(v_uint64x4,  uint64,   u64, d, long int)
OPENCV_HAL_IMPL_LASX_INIT(v_int64x4,   int64,    s64, d, long int)
inline __m256 _lasx_256_castsi256_ps(const __m256i &v)
{ return __m256(v); }

inline __m256d _lasx_256_castsi256_pd(const __m256i &v)
{ return __m256d(v); }

#define OPENCV_HAL_IMPL_LASX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast)  \
    inline _Tpvec v256_setzero_##suffix()                                  \
    { return _Tpvec(__lasx_xvreplgr2vr_d(0)); }                            \
    inline _Tpvec v256_setall_##suffix(_Tp v)                              \
    { return _Tpvec(_v256_setall_##zsuffix(v)); }                          \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32,  suffix, cast)           \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32,   suffix, cast)           \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast)           \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16,  suffix, cast)           \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8,  suffix, cast)           \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8,   suffix, cast)           \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4,  suffix, cast)           \
    OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4,   suffix, cast)

OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float32x8, float,  f32, ps, _lasx_256_castsi256_ps)
OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float64x4, double, f64, pd, _lasx_256_castsi256_pd)
inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
{ return a; }
inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
{ return v_float32x8(_lasx_256_castps_si256(__m256(a.val))); }

inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
{ return a; }
inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
{ return v_float64x4(_lasx_256_castpd_si256(__m256d(a.val))); }
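// Value reordering: interleave (unpacklo/unpackhi), per-lane shuffle,
// zip/combine and byte-alignment helpers for all vector types.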
#define OPENCV_HAL_IMPL_LASX_UNPACK(_Tpvec, suffix)                               \
    inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b)                 \
    { return _Tpvec(__lasx_xvilvl_##suffix(__m256i(b.val), __m256i(a.val))); }    \
    inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b)                 \
    { return _Tpvec(__lasx_xvilvh_##suffix(__m256i(b.val), __m256i(a.val))); }

OPENCV_HAL_IMPL_LASX_UNPACK(v_uint8x32,  b)
OPENCV_HAL_IMPL_LASX_UNPACK(v_int8x32,   b)
OPENCV_HAL_IMPL_LASX_UNPACK(v_uint16x16, h)
OPENCV_HAL_IMPL_LASX_UNPACK(v_int16x16,  h)
OPENCV_HAL_IMPL_LASX_UNPACK(v_uint32x8,  w)
OPENCV_HAL_IMPL_LASX_UNPACK(v_int32x8,   w)
OPENCV_HAL_IMPL_LASX_UNPACK(v_uint64x4,  d)
OPENCV_HAL_IMPL_LASX_UNPACK(v_int64x4,   d)
OPENCV_HAL_IMPL_LASX_UNPACK(v_float32x8, w)
OPENCV_HAL_IMPL_LASX_UNPACK(v_float64x4, d)
#define OPENCV_HAL_IMPL_LASX_SHUFFLE(_Tpvec, intrin)           \
    template<int m>                                            \
    inline _Tpvec v256_shuffle(const _Tpvec& a)                \
    { return _Tpvec(__lasx_xvshuf4i_##intrin(a.val, m)); }

OPENCV_HAL_IMPL_LASX_SHUFFLE(v_uint32x8, w)
OPENCV_HAL_IMPL_LASX_SHUFFLE(v_int32x8,  w)

template<int m>
inline v_float32x8 v256_shuffle(const v_float32x8 &a)
{ return v_float32x8(__lasx_xvshuf4i_w(*((__m256i*)&a.val), m)); }
template<int m>
inline v_float64x4 v256_shuffle(const v_float64x4 &a)
{
    // build an xvpermi_d selector (2 bits per 64-bit lane) from the 4-bit shuffle mask m
    int imm8 = m & 0b0001;
    if (m & 0b0010) imm8 |= 0b0100;
    if (m & 0b0100) imm8 |= 0b110000;
    else            imm8 |= 0b100000;
    if (m & 0b1000) imm8 |= 0b11000000;
    else            imm8 |= 0b10000000;

    return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8));
}
template<typename _Tpvec>
inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
{
    ab0 = v256_unpacklo(a, b);
    ab1 = v256_unpackhi(a, b);
}
template<typename _Tpvec>
inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(__lasx_xvpermi_q(a.val, b.val, 0x12)); }

inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
{ return v_float32x8(__lasx_xvpermi_q(a.val, b.val, 0x12)); }

inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
{ return v_float64x4(__lasx_xvpermi_q(a.val, b.val, 0x12)); }

template<typename _Tpvec>
inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
{ return v256_permute2x128<0x03>(a, b); }
inline __m256i _v256_alignr_b(const __m256i &a, const __m256i &b, const int imm)
{
    if (imm == 8)
    {
        return __lasx_xvshuf4i_d(b, a, 0x9);
    }
    __m256i byteIndex = _v256_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    return __lasx_xvshuf_b(a, b, __lasx_xvadd_b(__lasx_xvreplgr2vr_b(imm), byteIndex));
}
template<typename _Tpvec>
inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(_v256_alignr_b(a.val, b.val, 8)); }
inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
{ return v_float64x4(__lasx_xvshuf4i_d(b.val, a.val, 0x9)); }

template<typename _Tpvec>
inline _Tpvec v256_swap_halves(const _Tpvec& a)
{ return v256_permute2x128<1>(a, a); }

template<typename _Tpvec>
inline _Tpvec v256_reverse_64(const _Tpvec& a)
{ return v256_permute4x64<0x1b>(a); }
#define OPENCV_HAL_IMPL_LASX_ZIP(_Tpvec)                            \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)   \
    { return v256_permute2x128<0x02>(a, b); }                       \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)  \
    { return v256_permute2x128<0x13>(a, b); }                       \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,       \
                            _Tpvec& c, _Tpvec& d)                   \
    {                                                               \
        _Tpvec a1b0 = v256_alignr_128(a, b);                        \
        c = v256_combine_diagonal(a, a1b0);                         \
        d = v256_combine_diagonal(a1b0, b);                         \
    }                                                               \
    inline void v_zip(const _Tpvec& a, const _Tpvec& b,             \
                      _Tpvec& ab0, _Tpvec& ab1)                     \
    {                                                               \
        _Tpvec ab0ab2, ab1ab3;                                      \
        v256_zip(a, b, ab0ab2, ab1ab3);                             \
        v_recombine(ab0ab2, ab1ab3, ab0, ab1);                      \
    }

OPENCV_HAL_IMPL_LASX_ZIP(v_uint8x32)
OPENCV_HAL_IMPL_LASX_ZIP(v_int8x32)
OPENCV_HAL_IMPL_LASX_ZIP(v_uint16x16)
OPENCV_HAL_IMPL_LASX_ZIP(v_int16x16)
OPENCV_HAL_IMPL_LASX_ZIP(v_uint32x8)
OPENCV_HAL_IMPL_LASX_ZIP(v_int32x8)
OPENCV_HAL_IMPL_LASX_ZIP(v_uint64x4)
OPENCV_HAL_IMPL_LASX_ZIP(v_int64x4)
OPENCV_HAL_IMPL_LASX_ZIP(v_float32x8)
OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
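// Arithmetic, bitwise and comparison operators. Note that integer + and -
// saturate for 8/16-bit lanes (xvsadd/xvssub) and wrap for 32/64-bit lanes;
// the *_wrap functions further below always wrap.
//
// Usage sketch (illustrative only):
//
//   v_float32x8 a = v256_setall_f32(1.5f), b = v256_setall_f32(2.0f);
//   v_float32x8 c = a * b + a;   // lane-wise, via the operators defined below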
#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin)            \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)   \
    { return _Tpvec(intrin(a.val, b.val)); }                           \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)     \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32,  __lasx_xvsadd_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32,  __lasx_xvssub_bu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32,   __lasx_xvsadd_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32,   __lasx_xvssub_b)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16,  __lasx_xvsadd_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16,  __lasx_xvssub_h)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8,  __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8,  __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8,  __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8,   __lasx_xvadd_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8,   __lasx_xvsub_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8,   __lasx_xvmul_w)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4,  __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4,  __lasx_xvsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4,   __lasx_xvadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4,   __lasx_xvsub_d)

OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
// saturating multiply for 8-bit lanes
inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
{
    v_uint16x16 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
{
    v_int16x16 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
{
    __m256i pl = __lasx_xvmul_h(a.val, b.val);
    __m256i ph = __lasx_xvmuh_hu(a.val, b.val);
    __m256i p0 = __lasx_xvilvl_h(ph, pl);
    __m256i p1 = __lasx_xvilvh_h(ph, pl);
    return v_uint16x16(_v256_packs_epu32(p0, p1));
}
inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
{
    __m256i pl = __lasx_xvmul_h(a.val, b.val);
    __m256i ph = __lasx_xvmuh_h(a.val, b.val);
    __m256i p0 = __lasx_xvilvl_h(ph, pl);
    __m256i p1 = __lasx_xvilvh_h(ph, pl);
    return v_int16x16(_lasx_packs_w(p0, p1));
}
inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
{ a = a * b; return a; }
inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
{ a = a * b; return a; }
inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
{ a = a * b; return a; }
inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
{ a = a * b; return a; }
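// Non-saturating (wrapping) add/sub/mul variants; unlike the saturating
// 8/16-bit operators above, these always wrap around on overflow.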
#define OPENCV_HAL_IMPL_LASX_BIN_FUNC(func, _Tpvec, intrin)   \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)      \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint8x32,  __lasx_xvadd_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int8x32,   __lasx_xvadd_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint16x16, __lasx_xvadd_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int16x16,  __lasx_xvadd_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint8x32,  __lasx_xvsub_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int8x32,   __lasx_xvsub_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint16x16, __lasx_xvsub_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int16x16,  __lasx_xvsub_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_uint16x16, __lasx_xvmul_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16,  __lasx_xvmul_h)
inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i p0 = __lasx_xvmulwev_h_bu(a.val, b.val);
    __m256i p1 = __lasx_xvmulwod_h_bu(a.val, b.val);
    return v_uint8x32(__lasx_xvpackev_b(p1, p0));
}

inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
                         v_uint16x16& c, v_uint16x16& d)
{
    v_uint16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
                         v_int16x16& c, v_int16x16& d)
{
    v_int16x16 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
                         v_int32x8& c, v_int32x8& d)
{
    v_int16x16 vhi = v_int16x16(__lasx_xvmuh_h(a.val, b.val));

    v_int16x16 v0, v1;
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);
}

inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
                         v_uint32x8& c, v_uint32x8& d)
{
    v_uint16x16 vhi = v_uint16x16(__lasx_xvmuh_hu(a.val, b.val));

    v_uint16x16 v0, v1;
    v_zip(v_mul_wrap(a, b), vhi, v0, v1);

    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);
}

inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
                         v_uint64x4& c, v_uint64x4& d)
{
    __m256i v0 = __lasx_xvmulwev_d_wu(a.val, b.val);
    __m256i v1 = __lasx_xvmulwod_d_wu(a.val, b.val);
    v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
}
inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b)
{ return v_int16x16(__lasx_xvmuh_h(a.val, b.val)); }
inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b)
{ return v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); }
#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)                        \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm)                                   \
    { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }      \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm)                                   \
    { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }      \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)                                   \
    { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }      \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)                                   \
    { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }                       \
    template<int imm>                                                                        \
    inline _Tpuvec v_shl(const _Tpuvec& a)                                                   \
    { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }      \
    template<int imm>                                                                        \
    inline _Tpsvec v_shl(const _Tpsvec& a)                                                   \
    { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }      \
    template<int imm>                                                                        \
    inline _Tpuvec v_shr(const _Tpuvec& a)                                                   \
    { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }      \
    template<int imm>                                                                        \
    inline _Tpsvec v_shr(const _Tpsvec& a)                                                   \
    { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }

OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h)
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8,  v_int32x8,  w, __lasx_xvsra_w)
OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4,  v_int64x4,  d, __lasx_xvsra_d)
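// Bitwise logic: &, |, ^ and ~ for the integer types, plus float/double
// variants that operate through an __m256i view of the register.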
#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const)     \
    OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix)    \
    OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix)     \
    OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix)    \
    inline _Tpvec operator ~ (const _Tpvec& a)                       \
    { return _Tpvec(__lasx_xvnori_b(a.val, 0)); }

OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32,  v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32,   v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint16x16, v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int16x16,  v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint32x8,  v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int32x8,   v, __lasx_xvreplgr2vr_w(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4,  v, __lasx_xvreplgr2vr_d(-1))
OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4,   v, __lasx_xvreplgr2vr_d(-1))
#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast)                               \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)                                  \
    { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); }                          \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)                                    \
    { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }

#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast)     \
    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast)    \
    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast)     \
    OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast)    \
    inline _Tpvec operator ~ (const _Tpvec& a)                                   \
    { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }

OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float64x4, v, __lasx_xvreplgr2vr_d(-1), _lasx_256_castsi256_pd)
#define OPENCV_HAL_IMPL_LASX_SELECT(_Tpvec)                                        \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b)   \
    { return _Tpvec(__lasx_xvbitsel_v(b.val, a.val, mask.val)); }

OPENCV_HAL_IMPL_LASX_SELECT(v_uint8x32)
OPENCV_HAL_IMPL_LASX_SELECT(v_int8x32)
OPENCV_HAL_IMPL_LASX_SELECT(v_uint16x16)
OPENCV_HAL_IMPL_LASX_SELECT(v_int16x16)
OPENCV_HAL_IMPL_LASX_SELECT(v_uint32x8)
OPENCV_HAL_IMPL_LASX_SELECT(v_int32x8)
inline v_float32x8 v_select(const v_float32x8 &mask, const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }

inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }
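// Usage sketch for v_select (illustrative only): per-lane maximum expressed
// through a comparison mask.
//
//   v_float32x8 m  = a > b;              // all-ones lanes where a > b
//   v_float32x8 mx = v_select(m, a, b);  // behaves like v_max(a, b) for non-NaN inputs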
#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec)                      \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)    \
    { return ~(a == b); }                                           \
    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b)     \
    { return b > a; }                                               \
    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)    \
    { return ~(a < b); }                                            \
    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)    \
    { return b >= a; }

#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix)   \
    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)          \
    { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); }                 \
    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b)           \
    {                                                                        \
        return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val));                \
    }                                                                        \
    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)          \
    { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); }                 \
    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b)           \
    { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); }                 \
    OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec)                                  \
    OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint8x32,  v_int8x32,  b, bu)
OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8,  v_int32x8,  w, wu)

#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix)          \
    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)   \
    { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); }        \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)   \
    { return ~(a == b); }

OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4,  d)
#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix)    \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)     \
    { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }

#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix)             \
    OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix)    \
    OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix)    \
    OPENCV_HAL_IMPL_LASX_CMP_FLT(<,  xvfcmp_clt, _Tpvec, ssuffix)    \
    OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)

OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }

inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }

inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }

inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }

inline v_float32x8 v_not_nan(const v_float32x8& a)
{ return v_float32x8(__lasx_xvfcmp_cor_s(a.val, a.val)); }
inline v_float64x4 v_not_nan(const v_float64x4& a)
{ return v_float64x4(__lasx_xvfcmp_cor_d(a.val, a.val)); }
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint8x32,  __lasx_xvmin_bu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint8x32,  __lasx_xvmax_bu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int8x32,   __lasx_xvmin_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int8x32,   __lasx_xvmax_b)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint16x16, __lasx_xvmin_hu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint16x16, __lasx_xvmax_hu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int16x16,  __lasx_xvmin_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int16x16,  __lasx_xvmax_h)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint32x8,  __lasx_xvmin_wu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint32x8,  __lasx_xvmax_wu)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int32x8,   __lasx_xvmin_w)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int32x8,   __lasx_xvmax_w)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float32x8, __lasx_xvfmin_s)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float32x8, __lasx_xvfmax_s)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float64x4, __lasx_xvfmin_d)
OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float64x4, __lasx_xvfmax_d)
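// Rotate: shift lanes across the whole 256-bit register, optionally pulling
// the vacated lanes from a second vector. The byte-wise v_uint8x32 versions
// below do the real work; the cast-based macro re-expresses them for the
// other lane types.
//
// Usage sketch (illustrative only):
//
//   v_uint8x32 r = v_rotate_right<1>(a);     // lanes shifted down by one byte
//   v_uint8x32 s = v_rotate_left<4>(a, b);   // two-vector variant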
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
{
    enum {IMM_R  = (16 - imm) & 0xFF};
    enum {IMM_R2 = (32 - imm) & 0xFF};

    if (imm == 0)  return a;
    if (imm == 32) return b;
    if (imm > 32)  return v_uint8x32();

    __m256i swap = _v256_permute2x128<0x21>(a.val, b.val);
    if (imm == 16) return v_uint8x32(swap);
    if (imm < 16)  return v_uint8x32(_v256_alignr_b(a.val, swap, IMM_R));
    return v_uint8x32(_v256_alignr_b(swap, b.val, IMM_R2));
}
template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
{
    enum {IMM_L = (imm - 16) & 0xFF};

    if (imm == 0)  return a;
    if (imm == 32) return b;
    if (imm > 32)  return v_uint8x32();

    __m256i swap = _v256_permute2x128<0x03>(a.val, b.val);
    if (imm == 16) return v_uint8x32(swap);
    if (imm < 16)  return v_uint8x32(_v256_alignr_b(swap, a.val, imm));
    return v_uint8x32(_v256_alignr_b(b.val, swap, IMM_L));
}
template<int imm>
inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
{
    enum {IMM_L = (imm - 16) & 0xFF};
    enum {IMM_R = (16 - imm) & 0xFF};

    if (imm == 0) return a;
    if (imm > 32) return v_uint8x32();

    __m256i vzero = __lasx_xvreplgr2vr_w(0);
    __m256i swapz = __lasx_xvpermi_q(a.val, vzero, 0x20);
    if (imm == 16) return v_uint8x32(swapz);
    if (imm < 16)  return v_uint8x32(_v256_alignr_b(a.val, swapz, IMM_R));
    return v_uint8x32(__lasx_xvbsll_v(swapz, IMM_L));
}
template<int imm>
inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
{
    enum {IMM_L = (imm - 16) & 0xFF};

    if (imm == 0) return a;
    if (imm > 32) return v_uint8x32();

    __m256i vzero = __lasx_xvreplgr2vr_w(0);
    __m256i swapz = __lasx_xvpermi_q(vzero, a.val, 0x21);
    if (imm == 16) return v_uint8x32(swapz);
    if (imm < 16)  return v_uint8x32(_v256_alignr_b(swapz, a.val, imm));
    return v_uint8x32(__lasx_xvbsrl_v(swapz, IMM_L));
}
#define OPENCV_HAL_IMPL_LASX_ROTATE_CAST(intrin, _Tpvec, cast)           \
    template<int imm>                                                    \
    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)               \
    {                                                                    \
        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};         \
        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a),           \
                                       v_reinterpret_as_u8(b));          \
        return _Tpvec(cast(ret.val));                                    \
    }                                                                    \
    template<int imm>                                                    \
    inline _Tpvec intrin(const _Tpvec& a)                                \
    {                                                                    \
        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};         \
        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a));          \
        return _Tpvec(cast(ret.val));                                    \
    }

#define OPENCV_HAL_IMPL_LASX_ROTATE(_Tpvec)                                  \
    OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left,  _Tpvec, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)

OPENCV_HAL_IMPL_LASX_ROTATE(v_int8x32)
OPENCV_HAL_IMPL_LASX_ROTATE(v_uint16x16)
OPENCV_HAL_IMPL_LASX_ROTATE(v_int16x16)
OPENCV_HAL_IMPL_LASX_ROTATE(v_uint32x8)
OPENCV_HAL_IMPL_LASX_ROTATE(v_int32x8)
OPENCV_HAL_IMPL_LASX_ROTATE(v_uint64x4)
OPENCV_HAL_IMPL_LASX_ROTATE(v_int64x4)

OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left,  v_float32x8, _lasx_256_castsi256_ps)
OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float32x8, _lasx_256_castsi256_ps)
OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left,  v_float64x4, _lasx_256_castsi256_pd)
OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float64x4, _lasx_256_castsi256_pd)
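// v_reverse: reverse the order of all lanes in a vector.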
inline v_uint8x32 v_reverse(const v_uint8x32 &a)
{
    static const __m256i perm = _v256_setr_b(
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm);
    return v_uint8x32(__lasx_xvpermi_q(vec, vec, 1));
}

inline v_int8x32 v_reverse(const v_int8x32 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x16 v_reverse(const v_uint16x16 &a)
{
    __m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B);
    vec = __lasx_xvshuf4i_w(vec, 0x4E);
    return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E));
}

inline v_int16x16 v_reverse(const v_int16x16 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x8 v_reverse(const v_uint32x8 &a)
{
    __m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B);
    return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E));
}

inline v_int32x8 v_reverse(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x8 v_reverse(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x4 v_reverse(const v_uint64x4 &a)
{
    return v_uint64x4(__lasx_xvpermi_d(a.val, 0x1b));
}

inline v_int64x4 v_reverse(const v_int64x4 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x4 v_reverse(const v_float64x4 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
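// Reductions: horizontal sum/min/max, sum of absolute differences, popcount
// and sign-mask extraction.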
inline unsigned v_reduce_sum(const v_uint8x32& a)
{
    __m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val);
    __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
    __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
    __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
    return (unsigned)(((v8u32)t4)[0] + ((v8u32)t4)[4]);
}

inline int v_reduce_sum(const v_int8x32& a)
{
    __m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val);
    __m256i t2 = __lasx_xvhaddw_w_h(t1, t1);
    __m256i t3 = __lasx_xvhaddw_d_w(t2, t2);
    __m256i t4 = __lasx_xvhaddw_q_d(t3, t3);
    return (int)(((v8i32)t4)[0] + ((v8i32)t4)[4]);
}
#define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin)                \
    inline sctype v_reduce_##func(const _Tpvec& a)                                  \
    {                                                                               \
        __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val));  \
        val = intrin(val, __lsx_vbsrl_v(val, 8));                                   \
        val = intrin(val, __lsx_vbsrl_v(val, 4));                                   \
        val = intrin(val, __lsx_vbsrl_v(val, 2));                                   \
        val = intrin(val, __lsx_vbsrl_v(val, 1));                                   \
        return (sctype)__lsx_vpickve2gr_w(val, 0);                                  \
    }

OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, min, __lsx_vmin_bu)
OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32,  schar, min, __lsx_vmin_b)
OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, max, __lsx_vmax_bu)
OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32,  schar, max, __lsx_vmax_b)

#define OPENCV_HAL_IMPL_LASX_REDUCE_16(_Tpvec, sctype, func, intrin)    \
    inline sctype v_reduce_##func(const _Tpvec& a)                      \
    {                                                                   \
        __m128i v0 = _v256_extract_low(a.val);                          \
        __m128i v1 = _v256_extract_high(a.val);                         \
        v0 = intrin(v0, v1);                                            \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 8));                          \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 4));                          \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 2));                          \
        return (sctype)__lsx_vpickve2gr_w(v0, 0);                       \
    }

OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, min, __lsx_vmin_hu)
OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16,  short,  min, __lsx_vmin_h)
OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, max, __lsx_vmax_hu)
OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16,  short,  max, __lsx_vmax_h)

#define OPENCV_HAL_IMPL_LASX_REDUCE_8(_Tpvec, sctype, func, intrin)     \
    inline sctype v_reduce_##func(const _Tpvec& a)                      \
    {                                                                   \
        __m128i v0 = _v256_extract_low(a.val);                          \
        __m128i v1 = _v256_extract_high(a.val);                         \
        v0 = intrin(v0, v1);                                            \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 8));                          \
        v0 = intrin(v0, __lsx_vbsrl_v(v0, 4));                          \
        return (sctype)__lsx_vpickve2gr_w(v0, 0);                       \
    }

OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, min, __lsx_vmin_wu)
OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8,  int,      min, __lsx_vmin_w)
OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, max, __lsx_vmax_wu)
OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8,  int,      max, __lsx_vmax_w)

#define OPENCV_HAL_IMPL_LASX_REDUCE_FLT(func, intrin)                                          \
    inline float v_reduce_##func(const v_float32x8& a)                                         \
    {                                                                                          \
        __m128 v0 = _v256_extract_low(a.val);                                                  \
        __m128 v1 = _v256_extract_high(a.val);                                                 \
        v0 = intrin(v0, v1);                                                                   \
        v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x0e)));     \
        v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x01)));     \
        float *fvalue = (float*)&v0;                                                           \
        return fvalue[0];                                                                      \
    }

OPENCV_HAL_IMPL_LASX_REDUCE_FLT(min, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s)
inline int v_reduce_sum(const v_int32x8& a)
{
    __m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val);
    __m256i t2 = __lasx_xvhaddw_q_d(t1, t1);
    return (int)(((v8i32)t2)[0] + ((v8i32)t2)[4]);
}

inline float v_reduce_sum(const v_float32x8& a)
{
    float result = 0;
    float *pa = (float*)&a;
    for (int i = 0; i < 2; ++i) {
        result += pa[i*4] + pa[i*4+1] + pa[i*4+2] + pa[i*4+3];
    }
    return result;
}

inline uint64 v_reduce_sum(const v_uint64x4& a)
{
    __m256i t0 = __lasx_xvhaddw_qu_du(a.val, a.val);
    return (uint64)(((v4u64)t0)[0] + ((v4u64)t0)[2]);
}

inline int64 v_reduce_sum(const v_int64x4& a)
{
    __m256i t0 = __lasx_xvhaddw_q_d(a.val, a.val);
    return (int64)(((v4i64)t0)[0] + ((v4i64)t0)[2]);
}

inline double v_reduce_sum(const v_float64x4& a)
{
    double *pa = (double*)&a;
    return pa[0] + pa[1] + pa[2] + pa[3];
}
inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
                                 const v_float32x8& c, const v_float32x8& d)
{
    float *pa = (float*)&a;
    float *pb = (float*)&b;
    float *pc = (float*)&c;
    float *pd = (float*)&d;

    float v0 = pa[0] + pa[1] + pa[2] + pa[3];
    float v1 = pb[0] + pb[1] + pb[2] + pb[3];
    float v2 = pc[0] + pc[1] + pc[2] + pc[3];
    float v3 = pd[0] + pd[1] + pd[2] + pd[3];
    float v4 = pa[4] + pa[5] + pa[6] + pa[7];
    float v5 = pb[4] + pb[5] + pb[6] + pb[7];
    float v6 = pc[4] + pc[5] + pc[6] + pc[7];
    float v7 = pd[4] + pd[5] + pd[6] + pd[7];
    return v_float32x8(v0, v1, v2, v3, v4, v5, v6, v7);
}
inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
{
    __m256i t0 = __lasx_xvabsd_bu(a.val, b.val);
    __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
    __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
    __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
    __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
    return (unsigned)(((v8u32)t4)[0] + ((v8u32)t4)[4]);
}
inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
{
    __m256i t0 = __lasx_xvabsd_b(a.val, b.val);
    __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
    __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
    __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
    __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
    return (unsigned)(((v8u32)t4)[0] + ((v8u32)t4)[4]);
}
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
    v_uint32x8 l, h;
    v_expand(v_add_wrap(a - b, b - a), l, h);
    return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
    v_uint32x8 l, h;
    v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
    return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
    return v_reduce_sum(v_max(a, b) - v_min(a, b));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
    v_int32x8 m = a < b;
    return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
    v_float32x8 a_b = a - b;
    return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
}
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{ return v_uint8x32(__lasx_xvpcnt_b(a.val)); }
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{ return v_uint16x16(__lasx_xvpcnt_h(a.val)); }
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{ return v_uint32x8(__lasx_xvpcnt_w(a.val)); }
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{ return v_uint64x4(__lasx_xvpcnt_d(a.val)); }
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x8 v_popcount(const v_int32x8& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x4 v_popcount(const v_int64x4& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_int8x32& a)
{
    __m256i result = __lasx_xvmskltz_b(a.val);
    int mask = __lasx_xvpickve2gr_w(result, 0);
    mask |= (__lasx_xvpickve2gr_w(result, 4) << 16);
    return mask;
}
inline int v_signmask(const v_uint8x32& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }

inline int v_signmask(const v_uint16x16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_int32x8& a)
{
    __m256i result = __lasx_xvmskltz_w(a.val);
    int mask = __lasx_xvpickve2gr_w(result, 0);
    mask |= (__lasx_xvpickve2gr_w(result, 4) << 4);
    return mask;
}

inline int v_signmask(const v_int64x4& a)
{
    __m256i result = __lasx_xvmskltz_d(a.val);
    int mask = __lasx_xvpickve2gr_w(result, 0);
    mask |= (__lasx_xvpickve2gr_w(result, 4) << 2);
    return mask;
}
inline int v_signmask(const v_uint64x4& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
#define OPENCV_HAL_IMPL_LASX_CHECK(_Tpvec, allmask)                                \
    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; }  \
    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
OPENCV_HAL_IMPL_LASX_CHECK(v_uint8x32, -1)
OPENCV_HAL_IMPL_LASX_CHECK(v_int8x32, -1)
OPENCV_HAL_IMPL_LASX_CHECK(v_uint32x8, 255)
OPENCV_HAL_IMPL_LASX_CHECK(v_int32x8, 255)
OPENCV_HAL_IMPL_LASX_CHECK(v_uint64x4, 15)
OPENCV_HAL_IMPL_LASX_CHECK(v_int64x4, 15)
OPENCV_HAL_IMPL_LASX_CHECK(v_float32x8, 255)
OPENCV_HAL_IMPL_LASX_CHECK(v_float64x4, 15)

#define OPENCV_HAL_IMPL_LASX_CHECK_SHORT(_Tpvec)                                                                           \
    inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; }   \
    inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_uint16x16)
OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
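// Math and fused multiply-add: v_fma/v_muladd (a*b + c), sqrt, inverse sqrt,
// magnitude, and absolute value / absolute difference helpers.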
#define OPENCV_HAL_IMPL_LASX_MULADD(_Tpvec, suffix)                              \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)       \
    { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); }             \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
    { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); }             \
    inline _Tpvec v_sqrt(const _Tpvec& x)                                        \
    { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); }                           \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)              \
    { return v_fma(a, a, b * b); }                                               \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)                  \
    { return v_sqrt(v_fma(a, a, b*b)); }

OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
    return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val));
}

inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
{
    return v_fma(a, b, c);
}

inline v_float32x8 v_invsqrt(const v_float32x8& x)
{ return v_float32x8(__lasx_xvfrsqrt_s(x.val)); }

inline v_float64x4 v_invsqrt(const v_float64x4& x)
{ return v_float64x4(__lasx_xvfrsqrt_d(x.val)); }
#define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix)                                      \
    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)                                     \
    { return v_u##_Tpvec(__lasx_xvabsd_##suffix(x.val, __lasx_xvreplgr2vr_w(0))); }

OPENCV_HAL_IMPL_LASX_ABS(int8x32,  b)
OPENCV_HAL_IMPL_LASX_ABS(int16x16, h)
OPENCV_HAL_IMPL_LASX_ABS(int32x8,  w)

inline v_float32x8 v_abs(const v_float32x8& x)
{ return v_float32x8(*((__m256i*)&x) & __lasx_xvreplgr2vr_w(0x7fffffff)); }
inline v_float64x4 v_abs(const v_float64x4& x)
{ return v_float64x4(*((__m256i*)&x) & __lasx_xvreplgr2vr_d(0x7fffffffffffffff)); }
inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
{ return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); }
inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
{ return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); }
inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
{ return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); }

inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
{ return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); }
inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
{ return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); }
inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }

inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
{ return v_abs(a - b); }

inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
{ return v_abs(a - b); }
inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
{
    v_int8x32 d = a - b;
    v_int8x32 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
{ return v_max(a, b) - v_min(a, b); }
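// Rounding and type conversion: v_round/v_floor/v_ceil/v_trunc map
// floating-point lanes to int32, and the v_cvt_* helpers convert between
// int32, float32 and float64 vectors.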
inline v_int32x8 v_round(const v_float32x8& a)
{ return v_int32x8(__lasx_xvftint_w_s(a.val)); }

inline v_int32x8 v_round(const v_float64x4& a)
{
    __m256i t = __lasx_xvftint_w_d(a.val, a.val);
    return v_int32x8(__lasx_xvpermi_d(t, 0x88));
}

inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
{
    __m256i abi = __lasx_xvftint_w_d(b.val, a.val);
    return v_int32x8(__lasx_xvpermi_d(abi, 0b11011000));
}

inline v_int32x8 v_trunc(const v_float32x8& a)
{ return v_int32x8(__lasx_xvftintrz_w_s(a.val)); }

inline v_int32x8 v_trunc(const v_float64x4& a)
{
    __m256i t = __lasx_xvftintrz_w_d(a.val, a.val);
    return v_int32x8(__lasx_xvpermi_d(t, 0x88));
}

inline v_int32x8 v_floor(const v_float32x8& a)
{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrm_s(a.val)))); }

inline v_int32x8 v_floor(const v_float64x4& a)
{ return v_trunc(v_float64x4(__lasx_xvfrintrm_d(a.val))); }

inline v_int32x8 v_ceil(const v_float32x8& a)
{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrp_s(a.val)))); }

inline v_int32x8 v_ceil(const v_float64x4& a)
{ return v_trunc(v_float64x4(__lasx_xvfrintrp_d(a.val))); }
inline v_float32x8 v_cvt_f32(const v_int32x8& a)
{ return v_float32x8(__lasx_xvffint_s_w(a.val)); }

inline v_float32x8 v_cvt_f32(const v_float64x4& a)
{ return v_float32x8(__lasx_xvpermi_d(__lasx_xvfcvt_s_d(a.val, a.val), 0x88)); }

inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
{
    __m256 abf = __lasx_xvfcvt_s_d(a.val, b.val);
    return v_float32x8(__lasx_xvpermi_d(abf, 0x8D));
}

inline v_float64x4 v_cvt_f64(const v_int32x8& a)
{
    __m256i alow = __lasx_xvpermi_d(a.val, 0x10);
    return v_float64x4(__lasx_xvffintl_d_w(alow));
}

inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
{
    __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32);
    return v_float64x4(__lasx_xvffintl_d_w(ahigh));
}

inline v_float64x4 v_cvt_f64(const v_float32x8& a)
{
    __m256i alow = __lasx_xvpermi_d(a.val, 0x10);
    return v_float64x4(__lasx_xvfcvtl_d_s((__m256)alow));
}

inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
{
    __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32);
    return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh));
}

inline v_float64x4 v_cvt_f64(const v_int64x4& v)
{ return v_float64x4(__lasx_xvffint_d_l(v.val)); }
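// Lookup-table (gather) access: v256_lut gathers one element per index,
// v256_lut_pairs / v256_lut_quads gather two / four consecutive elements
// per index.
//
// Usage sketch (illustrative only):
//
//   float tab[256];                        // filled elsewhere
//   int   idx[8] = {0, 3, 9, 1, 7, 2, 5, 4};
//   v_float32x8 g = v256_lut(tab, idx);    // g = { tab[0], tab[3], ..., tab[4] }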
inline v_int8x32 v256_lut(const schar* tab, const int* idx)
{
    return v_int8x32(_v256_setr_b(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]],
                                  tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]],
                                  tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]], tab[idx[16]], tab[idx[17]],
                                  tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
                                  tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]],
                                  tab[idx[30]], tab[idx[31]]));
}

inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
{
    return v_int8x32(_v256_setr_h(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]),
                                  *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
                                  *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]),
                                  *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
                                  *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]),
                                  *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
                                  *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]),
                                  *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
}

inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x32(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
                                  *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
                                  *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7])));
}

inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
inline v_int16x16 v256_lut(const short* tab, const int* idx)
{
    return v_int16x16(_v256_setr_h(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]],
                                   tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]],
                                   tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
}
inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x16(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                   *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
                                   *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
                                   *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) ));
}
inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
{
    return v_int16x16(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
                                   *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
}
inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
inline v_int32x8 v256_lut(const int* tab, const int* idx)
{
    return v_int32x8(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                                  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
                                  *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
                                  *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) ));
}
inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x8(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
                                  *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
}
inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
{
    return v_int32x8(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0)));
}
inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
1722inline v_int64x4 v256_lut(
const int64* tab,
const int*
idx)
1724 return v_int64x4(_v256_setr_d(*(
const long long int*)(tab +
idx[0]), *(
const long long int*)(tab +
idx[1]),
1725 *(
const long long int*)(tab +
idx[2]), *(
const long long int*)(tab +
idx[3]) ));
1727inline v_int64x4 v256_lut_pairs(
const int64* tab,
const int*
idx)
1729 return v_int64x4(_v256_combine(__lsx_vld(tab +
idx[0], 0), __lsx_vld(tab +
idx[1], 0)));
1731inline v_uint64x4 v256_lut(
const uint64* tab,
const int*
idx) {
return v_reinterpret_as_u64(v256_lut((
const int64 *)tab,
idx)); }
1732inline v_uint64x4 v256_lut_pairs(
const uint64* tab,
const int*
idx) {
return v_reinterpret_as_u64(v256_lut_pairs((
const int64 *)tab,
idx)); }
1734inline v_float32x8 v256_lut(
const float* tab,
const int*
idx)
1736 return v_float32x8(_v256_setr_ps(tab[
idx[0]], tab[
idx[1]], tab[
idx[2]], tab[
idx[3]],
1739inline v_float32x8 v256_lut_pairs(
const float* tab,
const int*
idx) {
return v_reinterpret_as_f32(v256_lut_pairs((
const int *)tab,
idx)); }
1740inline v_float32x8 v256_lut_quads(
const float* tab,
const int*
idx) {
return v_reinterpret_as_f32(v256_lut_quads((
const int *)tab,
idx)); }
1742inline v_float64x4 v256_lut(
const double* tab,
const int*
idx)
1744 return v_float64x4(_v256_setr_pd(tab[
idx[0]], tab[
idx[1]], tab[
idx[2]], tab[
idx[3]]));
1746inline v_float64x4 v256_lut_pairs(
const double* tab,
const int*
idx)
1747{
return v_float64x4(_v256_combine(__lsx_vld(tab +
idx[0], 0), __lsx_vld(tab +
idx[1], 0))); }
1749inline v_int32x8
v_lut(
const int* tab,
const v_int32x8& idxvec)
1751 int *
idx = (
int*)&idxvec.val;
1752 return v256_lut(tab,
idx);
1755inline v_uint32x8
v_lut(
const unsigned* tab,
const v_int32x8& idxvec)
1757 return v_reinterpret_as_u32(
v_lut((
const int *)tab, idxvec));
1760inline v_float32x8
v_lut(
const float* tab,
const v_int32x8& idxvec)
1762 const int *
idx = (
const int*)&idxvec.val;
1763 return v256_lut(tab,
idx);
1766inline v_float64x4
v_lut(
const double* tab,
const v_int32x8& idxvec)
1768 const int *
idx = (
const int*)&idxvec.val;
1769 return v256_lut(tab,
idx);
1772inline void v_lut_deinterleave(
const float* tab,
const v_int32x8& idxvec, v_float32x8&
x, v_float32x8&
y)
1774 const int *
idx = (
const int*)&idxvec.val;
1775 __m128i xy01, xy45, xy23, xy67;
1776 xy01 = __lsx_vld(tab +
idx[0], 0);
1777 xy01 = __lsx_vextrins_d(xy01, __lsx_vld(tab +
idx[1], 0), 0x10);
1778 xy45 = __lsx_vld(tab +
idx[4], 0);
1779 xy45 = __lsx_vextrins_d(xy45, __lsx_vld(tab +
idx[5], 0), 0x10);
1780 __m256i xy0145 = _v256_combine(xy01, xy45);
1781 xy23 = __lsx_vld(tab +
idx[2], 0);
1782 xy23 = __lsx_vextrins_d(xy23, __lsx_vld(tab +
idx[3], 0), 0x10);
1783 xy67 = __lsx_vld(tab +
idx[6], 0);
1784 xy67 = __lsx_vextrins_d(xy67, __lsx_vld(tab +
idx[7], 0), 0x10);
1785 __m256i xy2367 = _v256_combine(xy23, xy67);
1787 __m256i xxyy0145 = __lasx_xvilvl_w(xy2367, xy0145);
1788 __m256i xxyy2367 = __lasx_xvilvh_w(xy2367, xy0145);
1790 x = v_float32x8(__lasx_xvilvl_w(xxyy2367, xxyy0145));
1791 y = v_float32x8(__lasx_xvilvh_w(xxyy2367, xxyy0145));
1794inline void v_lut_deinterleave(
const double* tab,
const v_int32x8& idxvec, v_float64x4&
x, v_float64x4&
y)
1797 const int *
idx = (
const int*)&idxvec.val;
1798 __m128i xy0 = __lsx_vld(tab +
idx[0], 0);
1799 __m128i xy2 = __lsx_vld(tab +
idx[2], 0);
1800 __m128i xy1 = __lsx_vld(tab +
idx[1], 0);
1801 __m128i xy3 = __lsx_vld(tab +
idx[3], 0);
1802 __m256i xy02 = _v256_combine(xy0, xy2);
1803 __m256i xy13 = _v256_combine(xy1, xy3);
1805 x = v_float64x4(__lasx_xvilvl_d(xy13, xy02));
1806 y = v_float64x4(__lasx_xvilvh_d(xy13, xy02));
1811 return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
1812 _v256_set_d(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
1818 return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
1819 _v256_set_d(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
1826 return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
1827 _v256_set_d(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
1833 return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
1834 _v256_set_d(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
1841 return v_int32x8(__lasx_xvshuf4i_w(vec.val, 0xd8));
1850 __m256i vzero = __lasx_xvreplgr2vr_w(0);
1851 __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
1852 _v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100));
1853 return v_int8x32(__lasx_xvperm_w(t1,
1854 _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1857{
return v_reinterpret_as_u8(
v_pack_triplets(v_reinterpret_as_s8(vec))); }
1861 __m256i vzero = __lasx_xvreplgr2vr_w(0);
1862 __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
1863 _v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100));
1864 return v_int16x16(__lasx_xvperm_w(t1,
1865 _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1868{
return v_reinterpret_as_u16(
v_pack_triplets(v_reinterpret_as_s16(vec))); }
1872 return v_int32x8(__lasx_xvperm_w(vec.val,
1873 _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1876{
return v_reinterpret_as_u32(
v_pack_triplets(v_reinterpret_as_s32(vec))); }
1879 return v_float32x8(__lasx_xvperm_w(*(__m256i*)(&vec.val),
1880 _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1888inline v_int32x8
v_dotprod(
const v_int16x16& a,
const v_int16x16& b)
1889{
return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
1891inline v_int32x8
v_dotprod(
const v_int16x16& a,
const v_int16x16& b,
const v_int32x8& c)
1895inline v_int64x4
v_dotprod(
const v_int32x8& a,
const v_int32x8& b)
1897 __m256i even = __lasx_xvmulwev_d_w(a.val, b.val);
1898 return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
1900inline v_int64x4
v_dotprod(
const v_int32x8& a,
const v_int32x8& b,
const v_int64x4& c)
1902 __m256i even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val);
1903 return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
1907inline v_uint32x8
v_dotprod_expand(
const v_uint8x32& a,
const v_uint8x32& b)
1909 __m256i even = __lasx_xvmulwev_h_bu(a.val, b.val);
1910 __m256i odd = __lasx_xvmulwod_h_bu(a.val, b.val);
1911 __m256i prod0 = __lasx_xvhaddw_wu_hu(even, even);
1912 __m256i prod1 = __lasx_xvhaddw_wu_hu(odd, odd);
1913 return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
1915inline v_uint32x8
v_dotprod_expand(
const v_uint8x32& a,
const v_uint8x32& b,
const v_uint32x8& c)
1920 __m256i even = __lasx_xvmulwev_h_b(a.val, b.val);
1921 __m256i odd = __lasx_xvmulwod_h_b(a.val, b.val);
1922 __m256i prod0 = __lasx_xvhaddw_w_h(even, even);
1923 __m256i prod1 = __lasx_xvhaddw_w_h(odd, odd);
1924 return v_int32x8(__lasx_xvadd_w(prod0, prod1));
1926inline v_int32x8
v_dotprod_expand(
const v_int8x32& a,
const v_int8x32& b,
const v_int32x8& c)
1930inline v_uint64x4
v_dotprod_expand(
const v_uint16x16& a,
const v_uint16x16& b)
1932 __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
1933 __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
1934 __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
1935 __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
1936 return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
1938inline v_uint64x4
v_dotprod_expand(
const v_uint16x16& a,
const v_uint16x16& b,
const v_uint64x4& c)
1943 __m256i even = __lasx_xvmulwev_w_h(a.val, b.val);
1944 __m256i odd = __lasx_xvmulwod_w_h(a.val, b.val);
1945 __m256i prod0 = __lasx_xvhaddw_d_w(even, even);
1946 __m256i prod1 = __lasx_xvhaddw_d_w(odd, odd);
1947 return v_int64x4(__lasx_xvadd_d(prod0, prod1));
1950inline v_int64x4
v_dotprod_expand(
const v_int16x16& a,
const v_int16x16& b,
const v_int64x4& c)
1956inline v_float64x4
v_dotprod_expand(
const v_int32x8& a,
const v_int32x8& b,
const v_float64x4& c)
1962inline v_int32x8
v_dotprod_fast(
const v_int16x16& a,
const v_int16x16& b)
1964inline v_int32x8
v_dotprod_fast(
const v_int16x16& a,
const v_int16x16& b,
const v_int32x8& c)
1968inline v_int64x4
v_dotprod_fast(
const v_int32x8& a,
const v_int32x8& b)
1970inline v_int64x4
v_dotprod_fast(
const v_int32x8& a,
const v_int32x8& b,
const v_int64x4& c)
1976inline v_uint32x8
v_dotprod_expand_fast(
const v_uint8x32& a,
const v_uint8x32& b,
const v_uint32x8& c)
1987 __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
1988 __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
1989 __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
1990 __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
1991 return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
1993inline v_uint64x4
v_dotprod_expand_fast(
const v_uint16x16& a,
const v_uint16x16& b,
const v_uint64x4& c)
1998 __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val));
1999 __m256i sign = __lasx_xvsrai_w(prod, 31);
2000 __m256i lo = __lasx_xvilvl_w(sign, prod);
2001 __m256i hi = __lasx_xvilvh_w(sign, prod);
2002 return v_int64x4(__lasx_xvadd_d(lo, hi));
2010inline v_float64x4
v_dotprod_expand_fast(
const v_int32x8& a,
const v_int32x8& b,
const v_float64x4& c)
2014#define OPENCV_HAL_LASX_SPLAT2_PS(a, im) \
2015 v_float32x8(__lasx_xvpermi_w(a.val, a.val, im))
2017inline v_float32x8
v_matmul(
const v_float32x8& v,
const v_float32x8& m0,
2018 const v_float32x8& m1,
const v_float32x8& m2,
2019 const v_float32x8& m3)
2021 v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);
2022 v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
2023 v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
2024 v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
2028inline v_float32x8
v_matmuladd(
const v_float32x8& v,
const v_float32x8& m0,
2029 const v_float32x8& m1,
const v_float32x8& m2,
2030 const v_float32x8& a)
2032 v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);
2033 v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
2034 v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
2039#define OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(_Tpvec, cast_from, cast_to) \
2040 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
2041 const _Tpvec& a2, const _Tpvec& a3, \
2042 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
2044 __m256i t0 = cast_from(__lasx_xvilvl_w(a1.val, a0.val)); \
2045 __m256i t1 = cast_from(__lasx_xvilvl_w(a3.val, a2.val)); \
2046 __m256i t2 = cast_from(__lasx_xvilvh_w(a1.val, a0.val)); \
2047 __m256i t3 = cast_from(__lasx_xvilvh_w(a3.val, a2.val)); \
2048 b0.val = cast_to(__lasx_xvilvl_d(t1, t0)); \
2049 b1.val = cast_to(__lasx_xvilvh_d(t1, t0)); \
2050 b2.val = cast_to(__lasx_xvilvl_d(t3, t2)); \
2051 b3.val = cast_to(__lasx_xvilvh_d(t3, t2)); \
2054OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_uint32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2055OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_int32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2057inline void v_transpose4x4(
const v_float32x8 &a0,
const v_float32x8 &a1,
2058 const v_float32x8 &a2,
const v_float32x8 &a3,
2059 v_float32x8 &b0, v_float32x8 &b1, v_float32x8 &b2, v_float32x8 &b3)
2061 __m256i t0 = __lasx_xvilvl_w(__m256i(a1.val), __m256i(a0.val));
2062 __m256i t1 = __lasx_xvilvl_w(__m256i(a3.val), __m256i(a2.val));
2063 __m256i t2 = __lasx_xvilvh_w(__m256i(a1.val), __m256i(a0.val));
2064 __m256i t3 = __lasx_xvilvh_w(__m256i(a3.val), __m256i(a2.val));
2065 b0.val = __m256(__lasx_xvilvl_d(t1, t0));
2066 b1.val = __m256(__lasx_xvilvh_d(t1, t0));
2067 b2.val = __m256(__lasx_xvilvl_d(t3, t2));
2068 b3.val = __m256(__lasx_xvilvh_d(t3, t2));
2074#define OPENCV_HAL_IMPL_LASX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
2075 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
2077 b0.val = intrin(a.val); \
2078 b1.val = intrin(__lasx_xvpermi_q(a.val, a.val, 0x11)); \
2080 inline _Tpwvec v_expand_low(const _Tpvec& a) \
2081 { return _Tpwvec(intrin(a.val)); } \
2082 inline _Tpwvec v_expand_high(const _Tpvec& a) \
2083 { return _Tpwvec(intrin(__lasx_xvpermi_q(a.val, a.val, 0x11))); } \
2084 inline _Tpwvec v256_load_expand(const _Tp* ptr) \
2086 __m128i a = __lsx_vld(ptr, 0); \
2087 return _Tpwvec(intrin(*((__m256i*)&a))); \
2090OPENCV_HAL_IMPL_LASX_EXPAND(v_uint8x32, v_uint16x16,
uchar, __lasx_vext2xv_hu_bu)
2091OPENCV_HAL_IMPL_LASX_EXPAND(v_int8x32, v_int16x16,
schar, __lasx_vext2xv_h_b)
2092OPENCV_HAL_IMPL_LASX_EXPAND(v_uint16x16, v_uint32x8,
ushort, __lasx_vext2xv_wu_hu)
2093OPENCV_HAL_IMPL_LASX_EXPAND(v_int16x16, v_int32x8,
short, __lasx_vext2xv_w_h)
2094OPENCV_HAL_IMPL_LASX_EXPAND(v_uint32x8, v_uint64x4,
unsigned, __lasx_vext2xv_du_wu)
2095OPENCV_HAL_IMPL_LASX_EXPAND(v_int32x8, v_int64x4,
int, __lasx_vext2xv_d_w)
2097#define OPENCV_HAL_IMPL_LASX_EXPAND_Q(_Tpvec, _Tp, intrin) \
2098 inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
2100 __m128i a = __lsx_vld(ptr, 0); \
2101 return _Tpvec(intrin(*((__m256i*)&a))); \
2104OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_uint32x8,
uchar, __lasx_vext2xv_wu_bu)
2105OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_int32x8,
schar, __lasx_vext2xv_w_b)
2109inline v_int8x32 v_pack(
const v_int16x16& a,
const v_int16x16& b)
2110{
return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); }
2112inline v_uint8x32 v_pack(
const v_uint16x16& a,
const v_uint16x16& b)
2113{
return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); }
2115inline v_uint8x32 v_pack_u(
const v_int16x16& a,
const v_int16x16& b)
2117 return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a.val, b.val)));
2126inline void v_pack_u_store(
uchar* ptr,
const v_int16x16& a)
2129template<
int n>
inline
2130v_uint8x32 v_rshr_pack(
const v_uint16x16& a,
const v_uint16x16& b)
2132 __m256i res = __lasx_xvssrlrni_bu_h(b.val, a.val, n);
2133 return v_uint8x32(_v256_shuffle_odd_64(res));
2136template<
int n>
inline
2137void v_rshr_pack_store(
uchar* ptr,
const v_uint16x16& a)
2139 __m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
2140 __lasx_xvstelm_d(res, ptr, 0, 0);
2141 __lasx_xvstelm_d(res, ptr, 8, 2);
2144template<
int n>
inline
2145v_uint8x32 v_rshr_pack_u(
const v_int16x16& a,
const v_int16x16& b)
2147 __m256i res = __lasx_xvssrarni_bu_h(b.val, a.val, n);
2148 return v_uint8x32(_v256_shuffle_odd_64(res));
2151template<
int n>
inline
2152void v_rshr_pack_u_store(
uchar* ptr,
const v_int16x16& a)
2154 __m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n);
2155 __lasx_xvstelm_d(res, ptr, 0, 0);
2156 __lasx_xvstelm_d(res, ptr, 8, 2);
2159template<
int n>
inline
2160v_int8x32 v_rshr_pack(
const v_int16x16& a,
const v_int16x16& b)
2162 __m256i res = __lasx_xvssrarni_b_h(b.val, a.val, n);
2163 return v_int8x32(_v256_shuffle_odd_64(res));
2166template<
int n>
inline
2167void v_rshr_pack_store(
schar* ptr,
const v_int16x16& a)
2169 __m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n);
2170 __lasx_xvstelm_d(res, ptr, 0, 0);
2171 __lasx_xvstelm_d(res, ptr, 8, 2);
2175inline v_int16x16 v_pack(
const v_int32x8& a,
const v_int32x8& b)
2176{
return v_int16x16(_v256_shuffle_odd_64(_lasx_packs_w(a.val, b.val))); }
2178inline v_uint16x16 v_pack(
const v_uint32x8& a,
const v_uint32x8& b)
2179{
return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
2181inline v_uint16x16 v_pack_u(
const v_int32x8& a,
const v_int32x8& b)
2182{
return v_uint16x16(_v256_shuffle_odd_64(_lasx_packus_w(a.val, b.val))); }
2184inline void v_pack_store(
short* ptr,
const v_int32x8& a)
2189 __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0);
2190 __lasx_xvstelm_d(res, ptr, 0, 0);
2191 __lasx_xvstelm_d(res, ptr, 8, 2);
2194inline void v_pack_u_store(
ushort* ptr,
const v_int32x8& a)
2197template<
int n>
inline
2198v_uint16x16 v_rshr_pack(
const v_uint32x8& a,
const v_uint32x8& b)
2199{
return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); }
2201template<
int n>
inline
2202void v_rshr_pack_store(
ushort* ptr,
const v_uint32x8& a)
2204 __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
2205 __lasx_xvstelm_d(res, ptr, 0, 0);
2206 __lasx_xvstelm_d(res, ptr, 8, 2);
2209template<
int n>
inline
2210v_uint16x16 v_rshr_pack_u(
const v_int32x8& a,
const v_int32x8& b)
2211{
return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); }
2213template<
int n>
inline
2214void v_rshr_pack_u_store(
ushort* ptr,
const v_int32x8& a)
2216 __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
2217 __lasx_xvstelm_d(res, ptr, 0, 0);
2218 __lasx_xvstelm_d(res, ptr, 8, 2);
2221template<
int n>
inline
2222v_int16x16 v_rshr_pack(
const v_int32x8& a,
const v_int32x8& b)
2223{
return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); }
2225template<
int n>
inline
2226void v_rshr_pack_store(
short* ptr,
const v_int32x8& a)
2228 __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
2229 __lasx_xvstelm_d(res, ptr, 0, 0);
2230 __lasx_xvstelm_d(res, ptr, 8, 2);
2235inline v_uint32x8 v_pack(
const v_uint64x4& a,
const v_uint64x4& b)
2237 __m256i ab = __lasx_xvpickev_w(b.val, a.val);
2238 return v_uint32x8(_v256_shuffle_odd_64(ab));
2241inline v_int32x8 v_pack(
const v_int64x4& a,
const v_int64x4& b)
2242{
return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2244inline void v_pack_store(
unsigned* ptr,
const v_uint64x4& a)
2246 __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
2247 v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
2251{
v_pack_store((
unsigned*)ptr, v_reinterpret_as_u64(b)); }
2253template<
int n>
inline
2254v_uint32x8 v_rshr_pack(
const v_uint64x4& a,
const v_uint64x4& b)
2255{
return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); }
2257template<
int n>
inline
2258void v_rshr_pack_store(
unsigned* ptr,
const v_uint64x4& a)
2260 __m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n);
2261 __lasx_xvstelm_d(res, ptr, 0, 0);
2262 __lasx_xvstelm_d(res, ptr, 8, 2);
2265template<
int n>
inline
2266v_int32x8 v_rshr_pack(
const v_int64x4& a,
const v_int64x4& b)
2267{
return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); }
2269template<
int n>
inline
2270void v_rshr_pack_store(
int* ptr,
const v_int64x4& a)
2272 __m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n);
2273 __lasx_xvstelm_d(res, ptr, 0, 0);
2274 __lasx_xvstelm_d(res, ptr, 8, 2);
2278inline v_uint8x32
v_pack_b(
const v_uint16x16& a,
const v_uint16x16& b)
2280 __m256i ab = _lasx_packs_h(a.val, b.val);
2281 return v_uint8x32(_v256_shuffle_odd_64(ab));
2284inline v_uint8x32
v_pack_b(
const v_uint32x8& a,
const v_uint32x8& b,
2285 const v_uint32x8& c,
const v_uint32x8& d)
2287 __m256i ab = _lasx_packs_w(a.val, b.val);
2288 __m256i cd = _lasx_packs_w(c.val, d.val);
2290 __m256i abcd = _v256_shuffle_odd_64(_lasx_packs_h(ab, cd));
2291 return v_uint8x32(__lasx_xvshuf4i_w(abcd, 0xd8));
2294inline v_uint8x32
v_pack_b(
const v_uint64x4& a,
const v_uint64x4& b,
const v_uint64x4& c,
2295 const v_uint64x4& d,
const v_uint64x4& e,
const v_uint64x4& f,
2296 const v_uint64x4& g,
const v_uint64x4& h)
2298 __m256i ab = _lasx_packs_w(a.val, b.val);
2299 __m256i cd = _lasx_packs_w(c.val, d.val);
2300 __m256i ef = _lasx_packs_w(e.val, f.val);
2301 __m256i gh = _lasx_packs_w(g.val, h.val);
2303 __m256i abcd = _lasx_packs_w(ab, cd);
2304 __m256i efgh = _lasx_packs_w(ef, gh);
2305 __m256i pkall = _v256_shuffle_odd_64(_lasx_packs_h(abcd, efgh));
2307 __m256i rev = _v256_alignr_b(pkall, pkall, 8);
2308 return v_uint8x32(__lasx_xvilvl_h(rev, pkall));
2315#define OPENCV_HAL_IMPL_LASX_EXTRACT(_Tpvec) \
2317 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2318 { return v_rotate_right<s>(a, b); }
2320OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint8x32)
2321OPENCV_HAL_IMPL_LASX_EXTRACT(v_int8x32)
2322OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint16x16)
2323OPENCV_HAL_IMPL_LASX_EXTRACT(v_int16x16)
2324OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint32x8)
2325OPENCV_HAL_IMPL_LASX_EXTRACT(v_int32x8)
2326OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint64x4)
2327OPENCV_HAL_IMPL_LASX_EXTRACT(v_int64x4)
2328OPENCV_HAL_IMPL_LASX_EXTRACT(v_float32x8)
2329OPENCV_HAL_IMPL_LASX_EXTRACT(v_float64x4)
2334 return (
uchar)_v256_extract_b<i>(a.val);
2340 return (
schar)v_extract_n<i>(v_reinterpret_as_u8(a));
2346 return (
ushort)_v256_extract_h<i>(a.val);
2352 return (
short)v_extract_n<i>(v_reinterpret_as_u16(a));
2358 return (
uint)_v256_extract_w<i>(a.val);
2364 return (
int)v_extract_n<i>(v_reinterpret_as_u32(a));
2370 return (
uint64)_v256_extract_d<i>(a.val);
2376 return (
int64)v_extract_n<i>(v_reinterpret_as_u64(v));
2382 union {
uint iv;
float fv; } d;
2383 d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
2390 union {
uint64 iv;
double dv; } d;
2391 d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
2398 static const __m256i perm = __lasx_xvreplgr2vr_w((
char)i);
2399 return v_uint32x8(__lasx_xvperm_w(a.val, perm));
2404{
return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2408{
return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2414 __m256i t0 = __lasx_xvld(ptr, 0);
2415 __m256i t1 = __lasx_xvld(ptr, 32);
2417 __m256i p0 = __lasx_xvpickev_b(t1, t0);
2418 __m256i p1 = __lasx_xvpickod_b(t1, t0);
2420 a.val = __lasx_xvpermi_d(p0, 0xd8);
2421 b.val = __lasx_xvpermi_d(p1, 0xd8);
2426 __m256i t0 = __lasx_xvld(ptr, 0);
2427 __m256i t1 = __lasx_xvld(ptr, 32);
2429 __m256i p0 = __lasx_xvpickev_h(t1, t0);
2430 __m256i p1 = __lasx_xvpickod_h(t1, t0);
2432 a.val = __lasx_xvpermi_d(p0, 0xd8);
2433 b.val = __lasx_xvpermi_d(p1, 0xd8);
2438 __m256i t0 = __lasx_xvld(ptr, 0);
2439 __m256i t1 = __lasx_xvld(ptr, 32);
2441 __m256i p0 = __lasx_xvpickev_w(t1, t0);
2442 __m256i p1 = __lasx_xvpickod_w(t1, t0);
2444 a.val = __lasx_xvpermi_d(p0, 0xd8);
2445 b.val = __lasx_xvpermi_d(p1, 0xd8);
2450 __m256i ab0 = __lasx_xvld(ptr, 0);
2451 __m256i ab1 = __lasx_xvld(ptr, 32);
2453 __m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02);
2454 __m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13);
2455 __m256i a0 = __lasx_xvilvl_d(ph, pl);
2456 __m256i b0 = __lasx_xvilvh_d(ph, pl);
2463 __m256i bgr0 = __lasx_xvld(ptr, 0);
2464 __m256i bgr1 = __lasx_xvld(ptr, 32);
2465 __m256i bgr2 = __lasx_xvld(ptr, 64);
2467 __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
2468 __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
2470 const __m256i m0 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2471 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2472 const __m256i m1 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2473 -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
2475 __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1);
2476 __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
2477 __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);
2480 sh_b = _v256_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
2481 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
2482 sh_g = _v256_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
2483 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
2484 sh_r = _v256_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2485 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2486 b0 = __lasx_xvshuf_b(b0, b0, sh_b);
2487 g0 = __lasx_xvshuf_b(g0, g0, sh_g);
2488 r0 = __lasx_xvshuf_b(r0, r0, sh_r);
2497 __m256i bgr0 = __lasx_xvld(ptr, 0);
2498 __m256i bgr1 = __lasx_xvld(ptr, 32);
2499 __m256i bgr2 = __lasx_xvld(ptr, 64);
2501 __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
2502 __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
2504 const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2505 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2506 const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2507 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2508 __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1);
2509 __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);
2510 __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
2511 const __m256i sh_b = _v256_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2512 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2513 const __m256i sh_g = _v256_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2514 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2515 const __m256i sh_r = _v256_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2516 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2517 b0 = __lasx_xvshuf_b(b0, b0, sh_b);
2518 g0 = __lasx_xvshuf_b(g0, g0, sh_g);
2519 r0 = __lasx_xvshuf_b(r0, r0, sh_r);
2521 a = v_uint16x16(b0);
2522 b = v_uint16x16(g0);
2523 c = v_uint16x16(r0);
2526inline void v_load_deinterleave(
const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
2528 __m256i bgr0 = __lasx_xvld(ptr, 0);
2529 __m256i bgr1 = __lasx_xvld(ptr, 32);
2530 __m256i bgr2 = __lasx_xvld(ptr, 64);
2532 __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
2533 __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
2535 __m256i m24 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);
2536 __m256i m92 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0);
2537 __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m24), bgr1, m92);
2538 __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m92), bgr1, m24);
2539 __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m24), s02_high, m92);
2541 b0 = __lasx_xvshuf4i_w(b0, 0x6c);
2542 g0 = __lasx_xvshuf4i_w(g0, 0xb1);
2543 r0 = __lasx_xvshuf4i_w(r0, 0xc6);
2552 __m256i bgr0 = __lasx_xvld(ptr, 0);
2553 __m256i bgr1 = __lasx_xvld(ptr, 32);
2554 __m256i bgr2 = __lasx_xvld(ptr, 64);
2556 __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12);
2557 __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12);
2558 __m256i s20r = __lasx_xvpermi_d(__lasx_xvpermi_q(bgr2, bgr0, 0x12), 0x1b);
2559 __m256i b0 = __lasx_xvilvl_d(s20r, s01);
2560 __m256i g0 = _v256_alignr_b(s12, s01, 8);
2561 __m256i r0 = __lasx_xvilvh_d(s12, s20r);
2570 __m256i t0 = __lasx_xvld(ptr, 0);
2571 __m256i t1 = __lasx_xvld(ptr, 32);
2572 __m256i t2 = __lasx_xvld(ptr, 64);
2573 __m256i t3 = __lasx_xvld(ptr, 96);
2575 const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
2576 __m256i ac_lo = __lasx_xvpickev_b(t1, t0);
2577 __m256i bd_lo = __lasx_xvpickod_b(t1, t0);
2578 __m256i ac_hi = __lasx_xvpickev_b(t3, t2);
2579 __m256i bd_hi = __lasx_xvpickod_b(t3, t2);
2581 __m256i a_pre = __lasx_xvpickev_b(ac_hi, ac_lo);
2582 __m256i c_pre = __lasx_xvpickod_b(ac_hi, ac_lo);
2583 __m256i b_pre = __lasx_xvpickev_b(bd_hi, bd_lo);
2584 __m256i d_pre = __lasx_xvpickod_b(bd_hi, bd_lo);
2586 a.val = __lasx_xvperm_w(a_pre, sh);
2587 b.val = __lasx_xvperm_w(b_pre, sh);
2588 c.val = __lasx_xvperm_w(c_pre, sh);
2589 d.val = __lasx_xvperm_w(d_pre, sh);
2594 __m256i t0 = __lasx_xvld(ptr, 0);
2595 __m256i t1 = __lasx_xvld(ptr, 32);
2596 __m256i t2 = __lasx_xvld(ptr, 64);
2597 __m256i t3 = __lasx_xvld(ptr, 96);
2599 const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
2600 __m256i ac_lo = __lasx_xvpickev_h(t1, t0);
2601 __m256i bd_lo = __lasx_xvpickod_h(t1, t0);
2602 __m256i ac_hi = __lasx_xvpickev_h(t3, t2);
2603 __m256i bd_hi = __lasx_xvpickod_h(t3, t2);
2605 __m256i a_pre = __lasx_xvpickev_h(ac_hi, ac_lo);
2606 __m256i c_pre = __lasx_xvpickod_h(ac_hi, ac_lo);
2607 __m256i b_pre = __lasx_xvpickev_h(bd_hi, bd_lo);
2608 __m256i d_pre = __lasx_xvpickod_h(bd_hi, bd_lo);
2610 a.val = __lasx_xvperm_w(a_pre, sh);
2611 b.val = __lasx_xvperm_w(b_pre, sh);
2612 c.val = __lasx_xvperm_w(c_pre, sh);
2613 d.val = __lasx_xvperm_w(d_pre, sh);
2616inline void v_load_deinterleave(
const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
2618 __m256i p0 = __lasx_xvld(ptr, 0);
2619 __m256i p1 = __lasx_xvld(ptr, 32);
2620 __m256i p2 = __lasx_xvld(ptr, 64);
2621 __m256i p3 = __lasx_xvld(ptr, 96);
2623 __m256i p01l = __lasx_xvilvl_w(p1, p0);
2624 __m256i p01h = __lasx_xvilvh_w(p1, p0);
2625 __m256i p23l = __lasx_xvilvl_w(p3, p2);
2626 __m256i p23h = __lasx_xvilvh_w(p3, p2);
2628 __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
2629 __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
2630 __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
2631 __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);
2633 __m256i b0 = __lasx_xvilvl_w(plh, pll);
2634 __m256i g0 = __lasx_xvilvh_w(plh, pll);
2635 __m256i r0 = __lasx_xvilvl_w(phh, phl);
2636 __m256i a0 = __lasx_xvilvh_w(phh, phl);
2646 __m256i bgra0 = __lasx_xvld(ptr, 0);
2647 __m256i bgra1 = __lasx_xvld(ptr, 32);
2648 __m256i bgra2 = __lasx_xvld(ptr, 64);
2649 __m256i bgra3 = __lasx_xvld(ptr, 96);
2651 __m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02);
2652 __m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13);
2653 __m256i l13 = __lasx_xvpermi_q(bgra1, bgra3, 0x02);
2654 __m256i h13 = __lasx_xvpermi_q(bgra1, bgra3, 0x13);
2656 __m256i b0 = __lasx_xvilvl_d(l13, l02);
2657 __m256i g0 = __lasx_xvilvh_d(l13, l02);
2658 __m256i r0 = __lasx_xvilvl_d(h13, h02);
2659 __m256i a0 = __lasx_xvilvh_d(h13, h02);
2672 __m256i xy_l = __lasx_xvilvl_b(
y.val,
x.val);
2673 __m256i xy_h = __lasx_xvilvh_b(
y.val,
x.val);
2675 __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2676 __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2678 __lasx_xvst(xy0, (__m256i*)ptr, 0);
2679 __lasx_xvst(xy1, (__m256i*)ptr, 32*1);
2685 __m256i xy_l = __lasx_xvilvl_h(
y.val,
x.val);
2686 __m256i xy_h = __lasx_xvilvh_h(
y.val,
x.val);
2688 __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2689 __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2691 __lasx_xvst(xy0, (__m256i*)ptr, 0);
2692 __lasx_xvst(xy1, (__m256i*)ptr, 16*2);
2698 __m256i xy_l = __lasx_xvilvl_w(
y.val,
x.val);
2699 __m256i xy_h = __lasx_xvilvh_w(
y.val,
x.val);
2701 __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2702 __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2704 __lasx_xvst(xy0, (__m256i*)ptr, 0);
2705 __lasx_xvst(xy1, (__m256i*)ptr, 8*4);
2711 __m256i xy_l = __lasx_xvilvl_d(
y.val,
x.val);
2712 __m256i xy_h = __lasx_xvilvh_d(
y.val,
x.val);
2714 __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2715 __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2717 __lasx_xvst(xy0, (__m256i*)ptr, 0);
2718 __lasx_xvst(xy1, (__m256i*)ptr, 4*8);
2724 const __m256i sh_b = _v256_setr_b(
2725 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
2726 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2727 const __m256i sh_g = _v256_setr_b(
2728 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
2729 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2730 const __m256i sh_r = _v256_setr_b(
2731 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
2732 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2734 __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b);
2735 __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
2736 __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);
2738 const __m256i m0 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2739 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2740 const __m256i m1 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2741 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2743 __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1);
2744 __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
2745 __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);
2747 __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16);
2748 __m256i bgr1 = __lasx_xvpermi_q(p0, p2, 0 + 3*16);
2749 __m256i bgr2 = __lasx_xvpermi_q(p2, p1, 1 + 3*16);
2751 __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2752 __lasx_xvst(bgr1, (__m256i*)ptr, 32);
2753 __lasx_xvst(bgr2, (__m256i*)ptr, 64);
2759 const __m256i sh_b = _v256_setr_b(
2760 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2761 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2762 const __m256i sh_g = _v256_setr_b(
2763 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2764 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2765 const __m256i sh_r = _v256_setr_b(
2766 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2767 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2769 __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b);
2770 __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
2771 __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);
2773 const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2774 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2775 const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2776 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2778 __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1);
2779 __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
2780 __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);
2782 __m256i bgr0 = __lasx_xvpermi_q(p2, p0, 0 + 2*16);
2783 __m256i bgr2 = __lasx_xvpermi_q(p2, p0, 1 + 3*16);
2785 __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2786 __lasx_xvst(p1, (__m256i*)ptr, 16*2);
2787 __lasx_xvst(bgr2, (__m256i*)ptr, 32*2);
2790inline void v_store_interleave(
unsigned* ptr,
const v_uint32x8& a,
const v_uint32x8& b,
const v_uint32x8& c,
2793 __m256i b0 = __lasx_xvshuf4i_w(a.val, 0x6c);
2794 __m256i g0 = __lasx_xvshuf4i_w(b.val, 0xb1);
2795 __m256i r0 = __lasx_xvshuf4i_w(c.val, 0xc6);
2797 __m256i bitmask_1 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0);
2798 __m256i bitmask_2 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);
2800 __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, bitmask_1), r0, bitmask_2);
2801 __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, bitmask_1), b0, bitmask_2);
2802 __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, bitmask_1), g0, bitmask_2);
2804 __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16);
2805 __m256i bgr2 = __lasx_xvpermi_q(p1, p0, 1 + 3*16);
2807 __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2808 __lasx_xvst(p2, (__m256i*)ptr, 8*4);
2809 __lasx_xvst(bgr2, (__m256i*)ptr, 16*4);
2815 __m256i s01 = __lasx_xvilvl_d(b.val, a.val);
2816 __m256i s12 = __lasx_xvilvh_d(c.val, b.val);
2817 __m256i s20 = __lasx_xvpermi_w(a.val, c.val, 0xe4);
2819 __m256i bgr0 = __lasx_xvpermi_q(s20, s01, 0 + 2*16);
2820 __m256i bgr1 = __lasx_xvpermi_q(s01, s12, 0x30);
2821 __m256i bgr2 = __lasx_xvpermi_q(s12, s20, 1 + 3*16);
2823 __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2824 __lasx_xvst(bgr1, (__m256i*)ptr, 4*8);
2825 __lasx_xvst(bgr2, (__m256i*)ptr, 8*8);
2829 const v_uint8x32& c,
const v_uint8x32& d,
2832 __m256i bg0 = __lasx_xvilvl_b(b.val, a.val);
2833 __m256i bg1 = __lasx_xvilvh_b(b.val, a.val);
2834 __m256i ra0 = __lasx_xvilvl_b(d.val, c.val);
2835 __m256i ra1 = __lasx_xvilvh_b(d.val, c.val);
2837 __m256i bgra0_ = __lasx_xvilvl_h(ra0, bg0);
2838 __m256i bgra1_ = __lasx_xvilvh_h(ra0, bg0);
2839 __m256i bgra2_ = __lasx_xvilvl_h(ra1, bg1);
2840 __m256i bgra3_ = __lasx_xvilvh_h(ra1, bg1);
2842 __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
2843 __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
2844 __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
2845 __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);
2847 __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2848 __lasx_xvst(bgra1, (__m256i*)ptr, 32);
2849 __lasx_xvst(bgra2, (__m256i*)ptr, 64);
2850 __lasx_xvst(bgra3, (__m256i*)ptr, 96);
2854 const v_uint16x16& c,
const v_uint16x16& d,
2857 __m256i bg0 = __lasx_xvilvl_h(b.val, a.val);
2858 __m256i bg1 = __lasx_xvilvh_h(b.val, a.val);
2859 __m256i ra0 = __lasx_xvilvl_h(d.val, c.val);
2860 __m256i ra1 = __lasx_xvilvh_h(d.val, c.val);
2862 __m256i bgra0_ = __lasx_xvilvl_w(ra0, bg0);
2863 __m256i bgra1_ = __lasx_xvilvh_w(ra0, bg0);
2864 __m256i bgra2_ = __lasx_xvilvl_w(ra1, bg1);
2865 __m256i bgra3_ = __lasx_xvilvh_w(ra1, bg1);
2867 __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
2868 __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
2869 __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
2870 __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);
2872 __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2873 __lasx_xvst(bgra1, (__m256i*)ptr, 16*2);
2874 __lasx_xvst(bgra2, (__m256i*)ptr, 32*2);
2875 __lasx_xvst(bgra3, (__m256i*)ptr, 48*2);
2878inline void v_store_interleave(
unsigned* ptr,
const v_uint32x8& a,
const v_uint32x8& b,
2879 const v_uint32x8& c,
const v_uint32x8& d,
2882 __m256i bg0 = __lasx_xvilvl_w(b.val, a.val);
2883 __m256i bg1 = __lasx_xvilvh_w(b.val, a.val);
2884 __m256i ra0 = __lasx_xvilvl_w(d.val, c.val);
2885 __m256i ra1 = __lasx_xvilvh_w(d.val, c.val);
2887 __m256i bgra0_ = __lasx_xvilvl_d(ra0, bg0);
2888 __m256i bgra1_ = __lasx_xvilvh_d(ra0, bg0);
2889 __m256i bgra2_ = __lasx_xvilvl_d(ra1, bg1);
2890 __m256i bgra3_ = __lasx_xvilvh_d(ra1, bg1);
2892 __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
2893 __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
2894 __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
2895 __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);
2897 __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2898 __lasx_xvst(bgra1, (__m256i*)ptr, 8*4);
2899 __lasx_xvst(bgra2, (__m256i*)ptr, 16*4);
2900 __lasx_xvst(bgra3, (__m256i*)ptr, 24*4);
2904 const v_uint64x4& c,
const v_uint64x4& d,
2907 __m256i bg0 = __lasx_xvilvl_d(b.val, a.val);
2908 __m256i bg1 = __lasx_xvilvh_d(b.val, a.val);
2909 __m256i ra0 = __lasx_xvilvl_d(d.val, c.val);
2910 __m256i ra1 = __lasx_xvilvh_d(d.val, c.val);
2912 __m256i bgra0 = __lasx_xvpermi_q(ra0, bg0, 0 + 2*16);
2913 __m256i bgra1 = __lasx_xvpermi_q(ra1, bg1, 0 + 2*16);
2914 __m256i bgra2 = __lasx_xvpermi_q(ra0, bg0, 1 + 3*16);
2915 __m256i bgra3 = __lasx_xvpermi_q(ra1, bg1, 1 + 3*16);
2917 __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2918 __lasx_xvst(bgra1, (__m256i*)(ptr), 4*8);
2919 __lasx_xvst(bgra2, (__m256i*)(ptr), 8*8);
2920 __lasx_xvst(bgra3, (__m256i*)(ptr), 12*8);
2924#define OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2925inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2928 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2929 a0 = v_reinterpret_as_##suffix0(a1); \
2930 b0 = v_reinterpret_as_##suffix0(b1); \
2932inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2934 _Tpvec1 a1, b1, c1; \
2935 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2936 a0 = v_reinterpret_as_##suffix0(a1); \
2937 b0 = v_reinterpret_as_##suffix0(b1); \
2938 c0 = v_reinterpret_as_##suffix0(c1); \
2940inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2942 _Tpvec1 a1, b1, c1, d1; \
2943 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2944 a0 = v_reinterpret_as_##suffix0(a1); \
2945 b0 = v_reinterpret_as_##suffix0(b1); \
2946 c0 = v_reinterpret_as_##suffix0(c1); \
2947 d0 = v_reinterpret_as_##suffix0(d1); \
2949inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2950 hal::StoreMode =hal::STORE_UNALIGNED ) \
2952 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2953 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2954 v_store_interleave((_Tp1*)ptr, a1, b1); \
2956inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
2957 hal::StoreMode =hal::STORE_UNALIGNED ) \
2959 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2960 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2961 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2962 v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
2964inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2965 const _Tpvec0& c0, const _Tpvec0& d0, \
2966 hal::StoreMode =hal::STORE_UNALIGNED ) \
2968 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2969 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2970 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2971 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2972 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
2975OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int8x32,
schar, s8, v_uint8x32,
uchar, u8)
2976OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int16x16,
short, s16, v_uint16x16,
ushort, u16)
2977OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int32x8,
int, s32, v_uint32x8,
unsigned, u32)
2978OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float32x8,
float, f32, v_uint32x8,
unsigned, u32)
2979OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int64x4,
int64, s64, v_uint64x4,
uint64, u64)
2980OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4,
double, f64, v_uint64x4,
uint64, u64)
2986inline v_float32x8 v256_load_expand(
const hfloat* ptr)
2990 return v_float32x8(__lasx_xvfcvtl_s_h(__lasx_xvpermi_d(__lsx_vld((
const __m128i*)ptr, 0), 0x10)));
2993 for (
int i = 0; i < 8; i++)
2994 buf[i] = (
float)ptr[i];
2995 return v256_load_aligned(buf);
2999inline void v_pack_store(hfloat* ptr,
const v_float32x8& a)
3002 __m256i ah = __lasx_xvfcvt_h_s(a.val, a.val);
3003 __lsx_vst((_m128i)ah, ptr, 0);
3007 for (
int i = 0; i < 8; i++)
3008 ptr[i] = hfloat(buf[i]);
3016inline void v256_cleanup() {}
3018CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
const int * idx
Definition core_c.h:668
const CvArr CvArr * x
Definition core_c.h:1195
const CvArr const CvArr CvArr * result
Definition core_c.h:1423
const CvArr * y
Definition core_c.h:1187
signed char schar
Definition interface.h:48
uint32_t uint
Definition interface.h:42
unsigned char uchar
Definition interface.h:51
int64_t int64
Definition interface.h:61
unsigned short ushort
Definition interface.h:52
uint64_t uint64
Definition interface.h:62
v_reg< float, n > v_matmul(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication.
Definition intrin_cpp.hpp:3193
v_reg< int, n > v_round(const v_reg< float, n > &a)
Round elements.
Definition intrin_cpp.hpp:2424
int v_signmask(const v_reg< _Tp, n > &a)
Get negative values mask.
Definition intrin_cpp.hpp:1392
void v_zip(const v_reg< _Tp, n > &a0, const v_reg< _Tp, n > &a1, v_reg< _Tp, n > &b0, v_reg< _Tp, n > &b1)
Interleave two vectors.
Definition intrin_cpp.hpp:1554
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements and expand.
Definition intrin_cpp.hpp:1142
V_TypeTraits< typenameV_TypeTraits< _Tp >::abs_type >::sum_type v_reduce_sad(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Sum absolute differences of values.
Definition intrin_cpp.hpp:1374
v_reg< int, n > v_ceil(const v_reg< float, n > &a)
Ceil elements.
Definition intrin_cpp.hpp:2462
v_reg< _Tp, n > v_pack_triplets(const v_reg< _Tp, n > &vec)
Definition intrin_cpp.hpp:2733
void v_store_low(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (lower half)
Definition intrin_cpp.hpp:2216
v_reg< int, n > v_floor(const v_reg< float, n > &a)
Floor elements.
Definition intrin_cpp.hpp:2449
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements.
Definition intrin_cpp.hpp:1077
int v_scan_forward(const v_reg< _Tp, n > &a)
Get first negative lane index.
Definition intrin_cpp.hpp:1409
v_reg< _Tp, n > v_reverse(const v_reg< _Tp, n > &a)
Vector reverse order.
Definition intrin_cpp.hpp:2343
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_absdiff(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Add values without saturation.
Definition intrin_cpp.hpp:953
v_reg< _Tp, n > v_interleave_pairs(const v_reg< _Tp, n > &vec)
Definition intrin_cpp.hpp:2703
V_TypeTraits< _Tp >::sum_type v_reduce_sum(const v_reg< _Tp, n > &a)
Element shift left among vector.
Definition intrin_cpp.hpp:1335
v_reg< _Tp, n > v_muladd(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
A synonym for v_fma.
Definition intrin_cpp.hpp:1057
v_reg< int, n > v_trunc(const v_reg< float, n > &a)
Truncate elements.
Definition intrin_cpp.hpp:2475
v_reg< _Tp, n > v_invsqrt(const v_reg< _Tp, n > &a)
Inversed square root.
Definition intrin_cpp.hpp:1007
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements and expand.
Definition intrin_cpp.hpp:1185
CV_INLINE v_reg< double,(n/2)> v_cvt_f64_high(const v_reg< int, n > &a)
Convert to double high part of vector.
Definition intrin_cpp.hpp:2584
v_reg< float, n > v_reduce_sum4(const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Sums all elements of each input vector, returns the vector of sums.
Definition intrin_cpp.hpp:1353
void v_mul_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &c, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &d)
Multiply and expand.
Definition intrin_cpp.hpp:1216
v_reg< _Tp, n > v_broadcast_element(const v_reg< _Tp, n > &a)
Broadcast i-th element of vector.
Definition intrin_cpp.hpp:2413
void v_pack_store(hfloat *ptr, const v_reg< float, n > &v)
Definition intrin_cpp.hpp:3289
v_reg< _Tp, n > v_interleave_quads(const v_reg< _Tp, n > &vec)
Definition intrin_cpp.hpp:2716
v_reg< _Tp, n > v_select(const v_reg< _Tp, n > &mask, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Per-element select (blend operation)
Definition intrin_cpp.hpp:1451
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_expand_low(const v_reg< _Tp, n > &a)
Expand lower values to the wider pack type.
Definition intrin_cpp.hpp:1496
CV_INLINE v_reg< double, n/2 > v_cvt_f64(const v_reg< int, n > &a)
Convert lower half to double.
Definition intrin_cpp.hpp:2573
void v_expand(const v_reg< _Tp, n > &a, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &b0, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &b1)
Expand values to the wider pack type.
Definition intrin_cpp.hpp:1474
v_reg< uchar, 2 *n > v_pack_b(const v_reg< ushort, n > &a, const v_reg< ushort, n > &b)
! For 16-bit boolean values
Definition intrin_cpp.hpp:3111
v_reg< _Tp, n > v_fma(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
Multiply and add.
Definition intrin_cpp.hpp:1046
void v_store_interleave(_Tp *ptr, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, hal::StoreMode=hal::STORE_UNALIGNED)
Interleave and store (2 channels)
Definition intrin_cpp.hpp:2115
void v_lut_deinterleave(const float *tab, const v_reg< int, n > &idx, v_reg< float, n > &x, v_reg< float, n > &y)
Definition intrin_cpp.hpp:2681
void v_transpose4x4(v_reg< _Tp, n > &a0, const v_reg< _Tp, n > &a1, const v_reg< _Tp, n > &a2, const v_reg< _Tp, n > &a3, v_reg< _Tp, n > &b0, v_reg< _Tp, n > &b1, v_reg< _Tp, n > &b2, v_reg< _Tp, n > &b3)
Transpose 4x4 matrix.
Definition intrin_cpp.hpp:2761
v_reg< _Tp, n > v_absdiffs(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Saturating absolute difference.
Definition intrin_cpp.hpp:994
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_expand_high(const v_reg< _Tp, n > &a)
Expand higher values to the wider pack type.
Definition intrin_cpp.hpp:1515
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements.
Definition intrin_cpp.hpp:1116
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut(const _Tp *tab, const int *idx)
Definition intrin_cpp.hpp:2626
v_reg< _Tp, n > v_mul_hi(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Multiply and extract high part.
Definition intrin_cpp.hpp:1233
v_reg< float, n > v_cvt_f32(const v_reg< int, n > &a)
Convert to float.
Definition intrin_cpp.hpp:2534
v_reg< float, n > v_matmuladd(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication and add.
Definition intrin_cpp.hpp:3223
_Tp v_extract_n(const v_reg< _Tp, n > &v)
Vector extract.
Definition intrin_cpp.hpp:2397
v_reg< float, n > v_not_nan(const v_reg< float, n > &a)
Less-than comparison.
Definition intrin_cpp.hpp:890
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_popcount(const v_reg< _Tp, n > &a)
Count the 1 bits in the vector lanes and return result as corresponding unsigned type.
Definition intrin_cpp.hpp:828
void v_store_aligned(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (aligned)
Definition intrin_cpp.hpp:2251
CV_INLINE v_reg< _Tp, n > & operator*=(v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
void v_load_deinterleave(const _Tp *ptr, v_reg< _Tp, n > &a, v_reg< _Tp, n > &b)
Load and deinterleave (2 channels)
Definition intrin_cpp.hpp:2043
#define CV_DECL_ALIGNED(x)
Definition cvdef.h:243
CV_EXPORTS OutputArray int double double InputArray mask
Definition imgproc.hpp:2132
StoreMode
Definition intrin.hpp:100
@ STORE_UNALIGNED
Definition intrin.hpp:101
"black box" representation of the file storage associated with a file on disk.
Definition calib3d.hpp:441
DualQuat< T > operator*(const T a, const DualQuat< T > &q)
Definition dualquaternion.inl.hpp:274