EstervQrCode 2.0.0
Library for QR code manipulation
intrin_lasx.hpp
1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html
4
5#ifndef OPENCV_HAL_INTRIN_LASX_HPP
6#define OPENCV_HAL_INTRIN_LASX_HPP
7
8#include <lsxintrin.h>
9#include <lasxintrin.h>
10
11#define CV_SIMD256 1
12#define CV_SIMD256_64F 1
13#define CV_SIMD256_FP16 0
14
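// These macros advertise this backend's capabilities to the universal-intrinsics
// layer: 256-bit integer and double-precision SIMD are available, FP16 storage
// conversion is not. A minimal sketch of how wide paths are usually guarded on
// them (illustrative only):
//
//   #if CV_SIMD256
//       // 256-bit path: 32 uchar / 16 short / 8 int lanes per register
//   #else
//       // scalar or narrower-SIMD fallback
//   #endif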
15namespace cv
16{
17
19
20CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
21
23
24inline __m256i _v256_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9,
25 char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19,
26 char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29,
27 char v30, char v31)
28{
29 return (__m256i)v32i8{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
30 v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
31 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
32 v30, v31 };
33}
34
35inline __m256i _v256_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9,
36 char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19,
37 char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29,
38 char v30, char v31)
39{
40 return (__m256i)v32i8{ v31, v30,
41 v29, v28, v27, v26, v25, v24, v23, v22, v21, v20,
42 v19, v18, v17, v16, v15, v14, v13, v12, v11, v10,
43 v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 };
44}
45
46inline __m256i _v256_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
47 short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15)
48{
49 return (__m256i)v16i16{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 };
50}
51
52inline __m256i _v256_setr_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
53{
54 return (__m256i)v8i32{ v0, v1, v2, v3, v4, v5, v6, v7 };
55}
56
57inline __m256i _v256_set_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
58{
59 return (__m256i)v8i32{ v7, v6, v5, v4, v3, v2, v1, v0 };
60}
61
62inline __m256i _v256_setall_w(int v0)
63{
64 return (__m256i)v8i32{ v0, v0, v0, v0, v0, v0, v0, v0 };
65}
66
67inline __m256i _v256_setr_d(int64 v0, int64 v1, int64 v2, int64 v3)
68{
69 return (__m256i)v4i64{ v0, v1, v2, v3 };
70}
71
72inline __m256i _v256_set_d(int64 v0, int64 v1, int64 v2, int64 v3)
73{
74 return (__m256i)v4i64{ v3, v2, v1, v0 };
75}
76
77inline __m256 _v256_setr_ps(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7)
78{
79 return (__m256)v8f32{ v0, v1, v2, v3, v4, v5, v6, v7 };
80}
81
82inline __m256 _v256_setall_ps(float f32)
83{
84 return (__m256)v8f32{ f32, f32, f32, f32, f32, f32, f32, f32 };
85}
86
87inline __m256d _v256_setr_pd(double v0, double v1, double v2, double v3)
88{
89 return (__m256d)v4f64{ v0, v1, v2, v3 };
90}
91
92inline __m256d _v256_setall_pd(double f64)
93{
94 return (__m256d)v4f64{ f64, f64, f64, f64 };
95}
96
97inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b)
98{
99 return __lasx_xvssrarni_bu_h(b, a, 0);
100}
101
102inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b)
103{
104 return __lasx_xvssrarni_b_h(b, a, 0);
105}
106
107inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b)
108{
109 return __lasx_xvssrarni_hu_w(b, a, 0);
110}
111
112inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b)
113{
114 return __lasx_xvssrarni_h_w(b, a, 0);
115}
116
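// With a shift amount of 0, xvssrarni acts as a pure saturating narrow, so these
// helpers behave like per-128-bit-lane pack operations (signed or unsigned
// saturation as the suffix indicates). A small sketch, values illustrative:
//
//   __m256i s16 = __lasx_xvreplgr2vr_h(300);      // sixteen 16-bit lanes of 300
//   __m256i u8  = _lasx_packus_h(s16, s16);       // every 8-bit lane saturates to 255
//   __m256i s8  = _lasx_packs_h(s16, s16);        // every 8-bit lane saturates to 127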
117inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
118{ return __lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02); }
119
120inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
121{ return __m256(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }
122
123inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
124{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }
125
126inline __m256i _v256_shuffle_odd_64(const __m256i& v)
127{ return __lasx_xvpermi_d(v, 0xd8); }
128
129inline __m256d _v256_shuffle_odd_64(const __m256d& v)
130{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&v), 0xd8)); }
131
 132//LASX: use only for permutes WITHOUT zero clearing
133template<int imm>
134inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
135{ return __lasx_xvpermi_q(a, b, imm); }
136
137template<int imm>
138inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
139{ return __m256(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }
140
141template<int imm>
142inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
143{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }
144
145template<int imm, typename _Tpvec>
146inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
147{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
148
149template<int imm>
150inline __m256i _v256_permute4x64(const __m256i& a)
151{ return __lasx_xvpermi_d(a, imm); }
152
153template<int imm>
154inline __m256d _v256_permute4x64(const __m256d& a)
155{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&a), imm)); }
156
157template<int imm, typename _Tpvec>
158inline _Tpvec v256_permute4x64(const _Tpvec& a)
159{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
160
161inline __m128i _v256_extract_high(const __m256i& v)
162{ __m256i temp256i = __lasx_xvpermi_d(v, 0x4E);
163 return *((__m128i*)&temp256i); }
164
165inline __m128 _v256_extract_high(const __m256& v)
166{ return __m128(_v256_extract_high(*((__m256i*)&v))); }
167
168inline __m128d _v256_extract_high(const __m256d& v)
169{ return __m128d(_v256_extract_high(*((__m256i*)&v))); }
170
171inline __m128i _v256_extract_low(const __m256i& v)
172{ return *((__m128i*)&v); }
173
174inline __m128 _v256_extract_low(const __m256& v)
175{ return __m128(_v256_extract_low(*((__m256i*)&v))); }
176
177inline __m128d _v256_extract_low(const __m256d& v)
178{ return __m128d(_v256_extract_low(*((__m256i*)&v))); }
179
180inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
181{
182 return __lasx_xvssrlrni_hu_w(b, a, 0);
183}
184
185template<int i>
186inline int _v256_extract_b(const __m256i& a)
187{
188 int des[1] = {0};
189 __lasx_xvstelm_b(a, des, 0, i);
190 return des[0];
191}
192
193template<int i>
194inline int _v256_extract_h(const __m256i& a)
195{
196 int des[1] = {0};
197 __lasx_xvstelm_h(a, des, 0, i);
198 return des[0];
199}
200
201template<int i>
202inline int _v256_extract_w(const __m256i& a)
203{
204 return __lasx_xvpickve2gr_w(a, i);
205}
206
207template<int i>
208inline int64 _v256_extract_d(const __m256i& a)
209{
210 return __lasx_xvpickve2gr_d(a, i);
211}
212
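// Each extract helper returns a single lane as a scalar; the template parameter is
// the lane index, counted from lane 0 upward. Sketch:
//
//   __m256i v = _v256_setr_w(10, 11, 12, 13, 14, 15, 16, 17);
//   int x = _v256_extract_w<3>(v);    // x == 13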
214
215struct v_uint8x32
216{
217 typedef uchar lane_type;
218 enum { nlanes = 32 };
219 __m256i val;
220
221 explicit v_uint8x32(__m256i v) : val(v) {}
222 v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3,
223 uchar v4, uchar v5, uchar v6, uchar v7,
224 uchar v8, uchar v9, uchar v10, uchar v11,
225 uchar v12, uchar v13, uchar v14, uchar v15,
226 uchar v16, uchar v17, uchar v18, uchar v19,
227 uchar v20, uchar v21, uchar v22, uchar v23,
228 uchar v24, uchar v25, uchar v26, uchar v27,
229 uchar v28, uchar v29, uchar v30, uchar v31)
230 {
231 val = _v256_setr_b((char)v0, (char)v1, (char)v2, (char)v3,
232 (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9,
233 (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
234 (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
235 (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
236 (char)v28, (char)v29, (char)v30, (char)v31);
237 }
238 /* coverity[uninit_ctor]: suppress warning */
239 v_uint8x32() {}
240
241 uchar get0() const {
242 uchar des[1] = {0};
243 __lasx_xvstelm_b(val, des, 0, 0);
244 return des[0];
245 }
246};
247
248struct v_int8x32
249{
250 typedef schar lane_type;
251 enum { nlanes = 32 };
252 __m256i val;
253
254 explicit v_int8x32(__m256i v) : val(v) {}
255 v_int8x32(schar v0, schar v1, schar v2, schar v3,
256 schar v4, schar v5, schar v6, schar v7,
257 schar v8, schar v9, schar v10, schar v11,
258 schar v12, schar v13, schar v14, schar v15,
259 schar v16, schar v17, schar v18, schar v19,
260 schar v20, schar v21, schar v22, schar v23,
261 schar v24, schar v25, schar v26, schar v27,
262 schar v28, schar v29, schar v30, schar v31)
263 {
264 val = _v256_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
265 v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
266 v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
267 }
268 /* coverity[uninit_ctor]: suppress warning */
269 v_int8x32() {}
270
271 schar get0() const {
272 schar des[1] = {0};
273 __lasx_xvstelm_b(val, des, 0, 0);
274 return des[0];
275 }
276};
277
278struct v_uint16x16
279{
280 typedef ushort lane_type;
281 enum { nlanes = 16 };
282 __m256i val;
283
284 explicit v_uint16x16(__m256i v) : val(v) {}
285 v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3,
286 ushort v4, ushort v5, ushort v6, ushort v7,
287 ushort v8, ushort v9, ushort v10, ushort v11,
288 ushort v12, ushort v13, ushort v14, ushort v15)
289 {
290 val = _v256_setr_h((short)v0, (short)v1, (short)v2, (short)v3,
291 (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
292 (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
293 }
294 /* coverity[uninit_ctor]: suppress warning */
295 v_uint16x16() {}
296
297 ushort get0() const {
298 ushort des[1] = {0};
299 __lasx_xvstelm_h(val, des, 0, 0);
300 return des[0];
301 }
302};
303
304struct v_int16x16
305{
306 typedef short lane_type;
307 enum { nlanes = 16 };
308 __m256i val;
309
310 explicit v_int16x16(__m256i v) : val(v) {}
311 v_int16x16(short v0, short v1, short v2, short v3,
312 short v4, short v5, short v6, short v7,
313 short v8, short v9, short v10, short v11,
314 short v12, short v13, short v14, short v15)
315 {
316 val = _v256_setr_h(v0, v1, v2, v3, v4, v5, v6, v7,
317 v8, v9, v10, v11, v12, v13, v14, v15);
318 }
319 /* coverity[uninit_ctor]: suppress warning */
320 v_int16x16() {}
321
322 short get0() const {
323 short des[1] = {0};
324 __lasx_xvstelm_h(val, des, 0, 0);
325 return des[0];
326 }
327};
328
329struct v_uint32x8
330{
331 typedef unsigned lane_type;
332 enum { nlanes = 8 };
333 __m256i val;
334
335 explicit v_uint32x8(__m256i v) : val(v) {}
336 v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
337 unsigned v4, unsigned v5, unsigned v6, unsigned v7)
338 {
339 val = _v256_setr_w((unsigned)v0, (unsigned)v1, (unsigned)v2,
340 (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
341 }
342 /* coverity[uninit_ctor]: suppress warning */
343 v_uint32x8() {}
344
345 unsigned get0() const { return __lasx_xvpickve2gr_wu(val, 0); }
346};
347
348struct v_int32x8
349{
350 typedef int lane_type;
351 enum { nlanes = 8 };
352 __m256i val;
353
354 explicit v_int32x8(__m256i v) : val(v) {}
355 v_int32x8(int v0, int v1, int v2, int v3,
356 int v4, int v5, int v6, int v7)
357 {
358 val = _v256_setr_w(v0, v1, v2, v3, v4, v5, v6, v7);
359 }
360 /* coverity[uninit_ctor]: suppress warning */
361 v_int32x8() {}
362
363 int get0() const { return __lasx_xvpickve2gr_w(val, 0); }
364};
365
366struct v_float32x8
367{
368 typedef float lane_type;
369 enum { nlanes = 8 };
370 __m256 val;
371
372 explicit v_float32x8(__m256 v) : val(v) {}
373 explicit v_float32x8(__m256i v) { val = *((__m256*)&v); }
374 v_float32x8(float v0, float v1, float v2, float v3,
375 float v4, float v5, float v6, float v7)
376 {
377 val = _v256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
378 }
379 /* coverity[uninit_ctor]: suppress warning */
380 v_float32x8() {}
381
382 float get0() const {
383 float des[1] = {0};
384 __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0);
385 return des[0];
386 }
387
388 int get0toint() const {
389 int des[1] = {0};
390 __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0);
391 return des[0];
392 }
393};
394
395struct v_uint64x4
396{
397 typedef uint64 lane_type;
398 enum { nlanes = 4 };
399 __m256i val;
400
401 explicit v_uint64x4(__m256i v) : val(v) {}
402 v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
403 { val = _v256_setr_d((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
404 /* coverity[uninit_ctor]: suppress warning */
405 v_uint64x4() {}
406
407 uint64 get0() const
408 {
409 return __lasx_xvpickve2gr_du(val, 0);
410 }
411};
412
413struct v_int64x4
414{
415 typedef int64 lane_type;
416 enum { nlanes = 4 };
417 __m256i val;
418
419 explicit v_int64x4(__m256i v) : val(v) {}
420 v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
421 { val = _v256_setr_d(v0, v1, v2, v3); }
422 /* coverity[uninit_ctor]: suppress warning */
423 v_int64x4() {}
424
425 int64 get0() const
426 {
427 return __lasx_xvpickve2gr_d(val, 0);
428 }
429};
430
431struct v_float64x4
432{
433 typedef double lane_type;
434 enum { nlanes = 4 };
435 __m256d val;
436
437 explicit v_float64x4(__m256d v) : val(v) {}
438 explicit v_float64x4(__m256i v) { val = *((__m256d*)&v); }
439 v_float64x4(double v0, double v1, double v2, double v3)
440 { val = _v256_setr_pd(v0, v1, v2, v3); }
441 /* coverity[uninit_ctor]: suppress warning */
442 v_float64x4() {}
443
444 double get0() const {
445 double des[1] = {0};
446 __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0);
447 return des[0];
448 }
449
450 int64 get0toint64() const {
451 int64 des[1] = {0};
452 __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0);
453 return des[0];
454 }
455};
456
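// Each wrapper pairs one 256-bit register (val) with the lane type and lane count
// that the generic universal-intrinsics code queries; get0() reads lane 0. Sketch:
//
//   v_int32x8 v(1, 2, 3, 4, 5, 6, 7, 8);
//   int first = v.get0();             // 1
//   int n = v_int32x8::nlanes;        // 8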
458
459#define OPENCV_HAL_IMPL_LASX_LOADSTORE(_Tpvec, _Tp) \
460 inline _Tpvec v256_load(const _Tp* ptr) \
461 { return _Tpvec(__lasx_xvld(ptr, 0)); } \
462 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
463 { return _Tpvec(__lasx_xvld(ptr, 0)); } \
464 inline _Tpvec v256_load_low(const _Tp* ptr) \
465 { \
466 __m128i v128 = __lsx_vld(ptr, 0); \
467 return _Tpvec(*((__m256i*)&v128)); \
468 } \
469 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
470 { \
471 __m128i vlo = __lsx_vld(ptr0, 0); \
472 __m128i vhi = __lsx_vld(ptr1, 0); \
473 return _Tpvec(_v256_combine(vlo, vhi)); \
474 } \
475 inline void v_store(_Tp* ptr, const _Tpvec& a) \
476 { __lasx_xvst(a.val, ptr, 0); } \
477 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
478 { __lasx_xvst(a.val, ptr, 0); } \
479 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
480 { __lasx_xvst(a.val, ptr, 0); } \
481 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
482 { \
483 if( mode == hal::STORE_UNALIGNED ) \
484 __lasx_xvst(a.val, ptr, 0); \
485 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
486 __lasx_xvst(a.val, ptr, 0); \
487 else \
488 __lasx_xvst(a.val, ptr, 0); \
489 } \
490 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
491 { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \
492 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
493 { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }
494
495OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint8x32, uchar)
496OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int8x32, schar)
497OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint16x16, ushort)
498OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int16x16, short)
499OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint32x8, unsigned)
500OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int32x8, int)
501OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint64x4, uint64)
502OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int64x4, int64)
503
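// Loads and stores all compile to plain xvld/xvst here, so the "aligned" variants
// accept but do not exploit alignment. A minimal round trip, illustrative only:
//
//   uchar src[32] = {0}, dst[32];
//   v_uint8x32 v = v256_load(src);    // 32 uchar lanes
//   v_store(dst, v);                  // write all 32 lanes
//   v_store_low(dst, v);              // write only the low 16 lanes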
504
505#define OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \
506 inline _Tpvec v256_load(const _Tp* ptr) \
507 { return _Tpvec(__lasx_xvld(ptr, 0)); } \
508 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
509 { return _Tpvec(__lasx_xvld(ptr, 0)); } \
510 inline _Tpvec v256_load_low(const _Tp* ptr) \
511 { \
512 __m128i v128 = __lsx_vld(ptr, 0); \
513 return _Tpvec(*((__m256i*)&v128)); \
514 } \
515 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
516 { \
517 halfreg vlo = __lsx_vld(ptr0, 0); \
518 halfreg vhi = __lsx_vld(ptr1, 0); \
519 return _Tpvec(_v256_combine(vlo, vhi)); \
520 } \
521 inline void v_store(_Tp* ptr, const _Tpvec& a) \
522 { __lasx_xvst(a.val, ptr, 0); } \
523 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
524 { __lasx_xvst(a.val, ptr, 0); } \
525 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
526 { __lasx_xvst(a.val, ptr, 0); } \
527 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
528 { \
529 if( mode == hal::STORE_UNALIGNED ) \
530 __lasx_xvst(a.val, ptr, 0); \
531 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
532 __lasx_xvst(a.val, ptr, 0); \
533 else \
534 __lasx_xvst(a.val, ptr, 0); \
535 } \
536 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
537 { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \
538 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
539 { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }
540
541OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float32x8, float, __m128i)
542OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float64x4, double, __m128i)
543
544
545inline __m256i _lasx_256_castps_si256(const __m256& v)
546{ return __m256i(v); }
547
548inline __m256i _lasx_256_castpd_si256(const __m256d& v)
549{ return __m256i(v); }
550
551#define OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
552 inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
553 { return _Tpvec(cast(a.val)); }
554
555#define OPENCV_HAL_IMPL_LASX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
556 inline _Tpvec v256_setzero_##suffix() \
557 { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
558 inline _Tpvec v256_setall_##suffix(_Tp v) \
559 { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \
560 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
561 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
562 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
563 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \
564 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \
565 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \
566 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \
567 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \
568 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float32x8, suffix, _lasx_256_castps_si256) \
569 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float64x4, suffix, _lasx_256_castpd_si256)
570
571OPENCV_HAL_IMPL_LASX_INIT(v_uint8x32, uchar, u8, b, int)
572OPENCV_HAL_IMPL_LASX_INIT(v_int8x32, schar, s8, b, int)
573OPENCV_HAL_IMPL_LASX_INIT(v_uint16x16, ushort, u16, h, int)
574OPENCV_HAL_IMPL_LASX_INIT(v_int16x16, short, s16, h, int)
575OPENCV_HAL_IMPL_LASX_INIT(v_uint32x8, unsigned, u32, w, int)
576OPENCV_HAL_IMPL_LASX_INIT(v_int32x8, int, s32, w, int)
577OPENCV_HAL_IMPL_LASX_INIT(v_uint64x4, uint64, u64, d, long int)
578OPENCV_HAL_IMPL_LASX_INIT(v_int64x4, int64, s64, d, long int)
579
580
581inline __m256 _lasx_256_castsi256_ps(const __m256i &v)
582{ return __m256(v); }
583
584inline __m256d _lasx_256_castsi256_pd(const __m256i &v)
585{ return __m256d(v); }
586
587#define OPENCV_HAL_IMPL_LASX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
588 inline _Tpvec v256_setzero_##suffix() \
589 { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
590 inline _Tpvec v256_setall_##suffix(_Tp v) \
591 { return _Tpvec(_v256_setall_##zsuffix(v)); } \
592 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
593 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, cast) \
594 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
595 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, cast) \
596 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, cast) \
597 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, cast) \
598 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, cast) \
599 OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, cast)
600
601OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float32x8, float, f32, ps, _lasx_256_castsi256_ps)
602OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float64x4, double, f64, pd, _lasx_256_castsi256_pd)
603
604inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
605{ return a; }
606inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
607{ return v_float32x8(_lasx_256_castps_si256(__m256(a.val))); }
608
609inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
610{ return a; }
611inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
612{ return v_float64x4(_lasx_256_castpd_si256(__m256d(a.val))); }
613
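// v256_setall_* broadcasts a scalar into every lane and v_reinterpret_as_* is a
// pure bit-level cast between lane types of the same register width. Sketch:
//
//   v_uint32x8 bits = v256_setall_u32(0x3f800000u);   // bit pattern of 1.0f
//   v_float32x8 ones = v_reinterpret_as_f32(bits);    // eight lanes of 1.0f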
614
616
617// unpacks
618#define OPENCV_HAL_IMPL_LASX_UNPACK(_Tpvec, suffix) \
619 inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
620 { return _Tpvec(__lasx_xvilvl_##suffix(__m256i(b.val), __m256i(a.val))); } \
621 inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
622 { return _Tpvec(__lasx_xvilvh_##suffix(__m256i(b.val), __m256i(a.val))); }
623
624OPENCV_HAL_IMPL_LASX_UNPACK(v_uint8x32, b)
625OPENCV_HAL_IMPL_LASX_UNPACK(v_int8x32, b)
626OPENCV_HAL_IMPL_LASX_UNPACK(v_uint16x16, h)
627OPENCV_HAL_IMPL_LASX_UNPACK(v_int16x16, h)
628OPENCV_HAL_IMPL_LASX_UNPACK(v_uint32x8, w)
629OPENCV_HAL_IMPL_LASX_UNPACK(v_int32x8, w)
630OPENCV_HAL_IMPL_LASX_UNPACK(v_uint64x4, d)
631OPENCV_HAL_IMPL_LASX_UNPACK(v_int64x4, d)
632OPENCV_HAL_IMPL_LASX_UNPACK(v_float32x8, w)
633OPENCV_HAL_IMPL_LASX_UNPACK(v_float64x4, d)
634
635
636// shuffle
637// todo: emulate 64bit
638#define OPENCV_HAL_IMPL_LASX_SHUFFLE(_Tpvec, intrin) \
639 template<int m> \
640 inline _Tpvec v256_shuffle(const _Tpvec& a) \
641 { return _Tpvec(__lasx_xvshuf4i_##intrin(a.val, m)); }
642
643OPENCV_HAL_IMPL_LASX_SHUFFLE(v_uint32x8, w)
644OPENCV_HAL_IMPL_LASX_SHUFFLE(v_int32x8, w)
645
646template<int m>
647inline v_float32x8 v256_shuffle(const v_float32x8 &a)
648{ return v_float32x8(__lasx_xvshuf4i_w(*((__m256i*)&a.val), m)); }
649
650template<int m>
651inline v_float64x4 v256_shuffle(const v_float64x4 &a)
652{
653 int imm8 = m & 0b0001; //0 or 1
 654    if (m & 0b0010) imm8 |= 0b0100;
 655    //else imm8 |= 0b0000;
 656    if (m & 0b0100) imm8 |= 0b110000; //2 or 3
 657    else imm8 |= 0b100000;
 658    if (m & 0b1000) imm8 |= 0b11000000;
659 else imm8 |= 0b10000000;
660
661 return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8));
662}
663template<typename _Tpvec>
664inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
665{
666 ab0 = v256_unpacklo(a, b);
667 ab1 = v256_unpackhi(a, b);
668}
669
670template<typename _Tpvec>
671inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
672{ return _Tpvec(__lasx_xvpermi_q(a.val, b.val, 0x12)); }
673
674inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
675{ return v_float32x8(__lasx_xvpermi_q(a.val, b.val, 0x12)); }
676
677inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
678{ return v_float64x4(__lasx_xvpermi_q(a.val, b.val, 0x12)); }
679
680template<typename _Tpvec>
681inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
682{ return v256_permute2x128<0x03>(a, b); }
683
684inline __m256i _v256_alignr_b(const __m256i &a, const __m256i &b, const int imm)
685{
686 if (imm == 8) {
687 return __lasx_xvshuf4i_d(b, a, 0x9); // b.d1 a.d0 b.d3 a.d2
688 } else {
689 __m256i byteIndex = _v256_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
690 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
691 return __lasx_xvshuf_b(a, b, __lasx_xvadd_b(__lasx_xvreplgr2vr_b(imm), byteIndex));
692 }
693}
694
695template<typename _Tpvec>
696inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
697{ return _Tpvec(_v256_alignr_b(a.val, b.val, 8)); }
698inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
699{ return v_float64x4(__lasx_xvshuf4i_d(b.val, a.val, 0x9)); } // b.d1 a.d0 b.d3 a.d2
700// todo: emulate float32
701
702template<typename _Tpvec>
703inline _Tpvec v256_swap_halves(const _Tpvec& a)
704{ return v256_permute2x128<1>(a, a); }
705
706template<typename _Tpvec>
707inline _Tpvec v256_reverse_64(const _Tpvec& a)
708{ return v256_permute4x64<0x1b>(a); }
709
710
711// ZIP
712#define OPENCV_HAL_IMPL_LASX_ZIP(_Tpvec) \
713 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
714 { return v256_permute2x128<0x02>(a, b); } \
715 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
716 { return v256_permute2x128<0x13>(a, b); } \
717 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
718 _Tpvec& c, _Tpvec& d) \
719 { \
720 _Tpvec a1b0 = v256_alignr_128(a, b); \
721 c = v256_combine_diagonal(a, a1b0); \
722 d = v256_combine_diagonal(a1b0, b); \
723 } \
724 inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
725 _Tpvec& ab0, _Tpvec& ab1) \
726 { \
727 _Tpvec ab0ab2, ab1ab3; \
728 v256_zip(a, b, ab0ab2, ab1ab3); \
729 v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
730 }
731
732OPENCV_HAL_IMPL_LASX_ZIP(v_uint8x32)
733OPENCV_HAL_IMPL_LASX_ZIP(v_int8x32)
734OPENCV_HAL_IMPL_LASX_ZIP(v_uint16x16)
735OPENCV_HAL_IMPL_LASX_ZIP(v_int16x16)
736OPENCV_HAL_IMPL_LASX_ZIP(v_uint32x8)
737OPENCV_HAL_IMPL_LASX_ZIP(v_int32x8)
738OPENCV_HAL_IMPL_LASX_ZIP(v_uint64x4)
739OPENCV_HAL_IMPL_LASX_ZIP(v_int64x4)
740OPENCV_HAL_IMPL_LASX_ZIP(v_float32x8)
741OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
742
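// v_zip yields a full-width interleave; the v_recombine step is needed because
// xvilvl/xvilvh interleave within each 128-bit half independently. Expected lane
// order for an illustrative pair of v_int32x8 values:
//
//   v_int32x8 a(0, 1, 2, 3, 4, 5, 6, 7), b(10, 11, 12, 13, 14, 15, 16, 17);
//   v_int32x8 lo, hi;
//   v_zip(a, b, lo, hi);   // lo = {0,10,1,11,2,12,3,13}, hi = {4,14,5,15,6,16,7,17}
//   v_int32x8 c = v_combine_low(a, b);    // {0,1,2,3, 10,11,12,13}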
743
744
745
746#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \
747 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
748 { return _Tpvec(intrin(a.val, b.val)); } \
749 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
750 { a.val = intrin(a.val, b.val); return a; }
751
752OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu)
753OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu)
754OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b)
755OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b)
756OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
757OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
758OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h)
759OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h)
760OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w)
761OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w)
762OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w)
763OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w)
764OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w)
765OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w)
766OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d)
767OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d)
768OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d)
769OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d)
770
771OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
772OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
773OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
774OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
775OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
776OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
777OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
778OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
779
780// saturating multiply 8-bit, 16-bit
781inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
782{
783 v_uint16x16 c, d;
784 v_mul_expand(a, b, c, d);
785 return v_pack(c, d);
786}
787inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
788{
789 v_int16x16 c, d;
790 v_mul_expand(a, b, c, d);
791 return v_pack(c, d);
792}
793inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
794{
795 __m256i pl = __lasx_xvmul_h(a.val, b.val);
796 __m256i ph = __lasx_xvmuh_hu(a.val, b.val);
797 __m256i p0 = __lasx_xvilvl_h(ph, pl);
798 __m256i p1 = __lasx_xvilvh_h(ph, pl);
799 return v_uint16x16(_v256_packs_epu32(p0, p1));
800}
801inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
802{
803 __m256i pl = __lasx_xvmul_h(a.val, b.val);
804 __m256i ph = __lasx_xvmuh_h(a.val, b.val);
805 __m256i p0 = __lasx_xvilvl_h(ph, pl);
806 __m256i p1 = __lasx_xvilvh_h(ph, pl);
807 return v_int16x16(_lasx_packs_w(p0, p1));
808}
809inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
810{ a = a * b; return a; }
811inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
812{ a = a * b; return a; }
813inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
814{ a = a * b; return a; }
815inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
816{ a = a * b; return a; }
817
820#define OPENCV_HAL_IMPL_LASX_BIN_FUNC(func, _Tpvec, intrin) \
821 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
822 { return _Tpvec(intrin(a.val, b.val)); }
823
824OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint8x32, __lasx_xvadd_b)
825OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int8x32, __lasx_xvadd_b)
826OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint16x16, __lasx_xvadd_h)
827OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int16x16, __lasx_xvadd_h)
828OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint8x32, __lasx_xvsub_b)
829OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int8x32, __lasx_xvsub_b)
830OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint16x16, __lasx_xvsub_h)
831OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int16x16, __lasx_xvsub_h)
832OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_uint16x16, __lasx_xvmul_h)
833OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16, __lasx_xvmul_h)
834
835inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
836{
837 __m256i p0 = __lasx_xvmulwev_h_bu(a.val, b.val);
838 __m256i p1 = __lasx_xvmulwod_h_bu(a.val, b.val);
839 return v_uint8x32(__lasx_xvpackev_b(p1, p0));
840}
841
842inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
843{
844 return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
845}
846
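// For 8- and 16-bit lanes the overloaded +/- saturate (xvsadd/xvssub), while the
// v_add_wrap/v_sub_wrap/v_mul_wrap family above keeps modulo-2^n behaviour.
// A sketch contrasting the two, values illustrative:
//
//   v_uint8x32 x = v256_setall_u8(200), y = v256_setall_u8(100);
//   v_uint8x32 s = x + y;              // every lane saturates to 255
//   v_uint8x32 w = v_add_wrap(x, y);   // every lane wraps to 44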
847// Multiply and expand
848inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
849 v_uint16x16& c, v_uint16x16& d)
850{
851 v_uint16x16 a0, a1, b0, b1;
852 v_expand(a, a0, a1);
853 v_expand(b, b0, b1);
854 c = v_mul_wrap(a0, b0);
855 d = v_mul_wrap(a1, b1);
856}
857
858inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
859 v_int16x16& c, v_int16x16& d)
860{
861 v_int16x16 a0, a1, b0, b1;
862 v_expand(a, a0, a1);
863 v_expand(b, b0, b1);
864 c = v_mul_wrap(a0, b0);
865 d = v_mul_wrap(a1, b1);
866}
867
868inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
869 v_int32x8& c, v_int32x8& d)
870{
871 v_int16x16 vhi = v_int16x16(__lasx_xvmuh_h(a.val, b.val));
872
873 v_int16x16 v0, v1;
874 v_zip(v_mul_wrap(a, b), vhi, v0, v1);
875
876 c = v_reinterpret_as_s32(v0);
877 d = v_reinterpret_as_s32(v1);
878}
879
880inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
881 v_uint32x8& c, v_uint32x8& d)
882{
883 v_uint16x16 vhi = v_uint16x16(__lasx_xvmuh_hu(a.val, b.val));
884
885 v_uint16x16 v0, v1;
886 v_zip(v_mul_wrap(a, b), vhi, v0, v1);
887
888 c = v_reinterpret_as_u32(v0);
889 d = v_reinterpret_as_u32(v1);
890}
891
892inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
893 v_uint64x4& c, v_uint64x4& d)
894{
895 __m256i v0 = __lasx_xvmulwev_d_wu(a.val, b.val);
896 __m256i v1 = __lasx_xvmulwod_d_wu(a.val, b.val);
897 v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
898}
899
900inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(__lasx_xvmuh_h(a.val, b.val)); }
901inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); }
902
904#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
905 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
906 { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
907 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
908 { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
909 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
910 { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
911 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
912 { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
913 template<int imm> \
914 inline _Tpuvec v_shl(const _Tpuvec& a) \
915 { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
916 template<int imm> \
917 inline _Tpsvec v_shl(const _Tpsvec& a) \
918 { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
919 template<int imm> \
920 inline _Tpuvec v_shr(const _Tpuvec& a) \
921 { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
922 template<int imm> \
923 inline _Tpsvec v_shr(const _Tpsvec& a) \
924 { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }
925
926OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h)
927OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8, v_int32x8, w, __lasx_xvsra_w)
928OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
929
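// Shift counts are broadcast into a vector register; for >> the unsigned types use
// logical shifts (xvsrl) and the signed types arithmetic shifts (xvsra). Sketch:
//
//   v_uint32x8 u = v256_setall_u32(0x80000001u);
//   v_uint32x8 lsr = u >> 1;                            // 0x40000000 per lane
//   v_int32x8  asr = v_reinterpret_as_s32(u) >> 1;      // 0xC0000000 per lane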
930
931
932#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \
933 OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \
934 OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
935 OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
936 inline _Tpvec operator ~ (const _Tpvec& a) \
937 { return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
938
939OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
940OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32, v, __lasx_xvreplgr2vr_w(-1))
941OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint16x16, v, __lasx_xvreplgr2vr_w(-1))
942OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int16x16, v, __lasx_xvreplgr2vr_w(-1))
943OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint32x8, v, __lasx_xvreplgr2vr_w(-1))
944OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int32x8, v, __lasx_xvreplgr2vr_w(-1))
945OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1))
946OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1))
947
948#define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
949 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
950 { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \
951 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
952 { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }
953
954#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \
955 OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \
956 OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \
957 OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \
958 inline _Tpvec operator ~ (const _Tpvec& a) \
959 { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }
960
961OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
962OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float64x4, v, __lasx_xvreplgr2vr_d(-1), _lasx_256_castsi256_pd)
963
964
965#define OPENCV_HAL_IMPL_LASX_SELECT(_Tpvec) \
966 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
967 { return _Tpvec(__lasx_xvbitsel_v(b.val, a.val, mask.val)); }
968
969OPENCV_HAL_IMPL_LASX_SELECT(v_uint8x32)
970OPENCV_HAL_IMPL_LASX_SELECT(v_int8x32)
971OPENCV_HAL_IMPL_LASX_SELECT(v_uint16x16)
972OPENCV_HAL_IMPL_LASX_SELECT(v_int16x16)
973OPENCV_HAL_IMPL_LASX_SELECT(v_uint32x8)
974OPENCV_HAL_IMPL_LASX_SELECT(v_int32x8)
975
976inline v_float32x8 v_select(const v_float32x8 &mask, const v_float32x8 &a, const v_float32x8 &b)
977{ return v_float32x8(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }
978
979inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const v_float64x4 &b)
980{ return v_float64x4(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }
981
983#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \
984 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
985 { return ~(a == b); } \
986 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
987 { return b > a; } \
988 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
989 { return ~(a < b); } \
990 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
991 { return b >= a; }
992
993#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
994 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
995 { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
996 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
997 { \
998 return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \
999 } \
1000 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
1001 { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
1002 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
1003 { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \
1004 OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \
1005 OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
1006
1007OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint8x32, v_int8x32, b, bu)
1008OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
1009OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu)
1010
1011#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \
1012 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1013 { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
1014 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1015 { return ~(a == b); }
1016
1017OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
1018OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)
1019
1020#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
1021 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
1022 { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }
1023
1024#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \
1025 OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \
1026 OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \
1027 OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \
1028 OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)
1029
1030OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
1031OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
1032
1033inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
1034{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }
1035
1036inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
1037{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }
1038
1039inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
1040{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }
1041
1042inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
1043{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }
1044
1045inline v_float32x8 v_not_nan(const v_float32x8& a)
1046{ return v_float32x8(__lasx_xvfcmp_cor_s(a.val, a.val)); }
1047inline v_float64x4 v_not_nan(const v_float64x4& a)
1048{ return v_float64x4(__lasx_xvfcmp_cor_d(a.val, a.val)); }
1049
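// Comparisons return all-ones / all-zero lane masks, which plug directly into
// v_select (a bitwise blend via xvbitsel). A sketch of a per-lane clamp at zero,
// assuming the operators defined above:
//
//   v_float32x8 x    = v256_setall_f32(-2.5f);
//   v_float32x8 zero = v256_setall_f32(0.f);
//   v_float32x8 y    = v_select(x < zero, zero, x);   // negative lanes become 0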
1051OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint8x32, __lasx_xvmin_bu)
1052OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint8x32, __lasx_xvmax_bu)
1053OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int8x32, __lasx_xvmin_b)
1054OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int8x32, __lasx_xvmax_b)
1055OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint16x16, __lasx_xvmin_hu)
1056OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint16x16, __lasx_xvmax_hu)
1057OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int16x16, __lasx_xvmin_h)
1058OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int16x16, __lasx_xvmax_h)
1059OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint32x8, __lasx_xvmin_wu)
1060OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint32x8, __lasx_xvmax_wu)
1061OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int32x8, __lasx_xvmin_w)
1062OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int32x8, __lasx_xvmax_w)
1063OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float32x8, __lasx_xvfmin_s)
1064OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float32x8, __lasx_xvfmax_s)
1065OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float64x4, __lasx_xvfmin_d)
1066OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float64x4, __lasx_xvfmax_d)
1067
1068
1069template<int imm>
1070inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
1071{
1072 enum {IMM_R = (16 - imm) & 0xFF};
1073 enum {IMM_R2 = (32 - imm) & 0xFF};
1074
1075 if (imm == 0) return a;
1076 if (imm == 32) return b;
1077 if (imm > 32) return v_uint8x32();
1078
1079 __m256i swap = _v256_permute2x128<0x21>(a.val, b.val);
1080 if (imm == 16) return v_uint8x32(swap);
1081 if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swap, IMM_R));
1082 return v_uint8x32(_v256_alignr_b(swap, b.val, IMM_R2)); // imm < 32
1083}
1084
1085template<int imm>
1086inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
1087{
1088 enum {IMM_L = (imm - 16) & 0xFF};
1089
1090 if (imm == 0) return a;
1091 if (imm == 32) return b;
1092 if (imm > 32) return v_uint8x32();
1093
1094 __m256i swap = _v256_permute2x128<0x03>(a.val, b.val);
1095 if (imm == 16) return v_uint8x32(swap);
1096 if (imm < 16) return v_uint8x32(_v256_alignr_b(swap, a.val, imm));
1097 return v_uint8x32(_v256_alignr_b(b.val, swap, IMM_L));
1098}
1099
1100template<int imm>
1101inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
1102{
1103 enum {IMM_L = (imm - 16) & 0xFF};
1104 enum {IMM_R = (16 - imm) & 0xFF};
1105
1106 if (imm == 0) return a;
1107 if (imm > 32) return v_uint8x32();
1108
1109 // ESAC control[3] ? [127:0] = 0
1110 __m256i vzero = __lasx_xvreplgr2vr_w(0);
 1111    __m256i swapz = __lasx_xvpermi_q(a.val, vzero, 0x20);
1112 if (imm == 16) return v_uint8x32(swapz);
1113 if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swapz, IMM_R));
1114 return v_uint8x32(__lasx_xvbsll_v(swapz, IMM_L));
1115}
1116
1117template<int imm>
1118inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
1119{
1120 enum {IMM_L = (imm - 16) & 0xFF};
1121
1122 if (imm == 0) return a;
1123 if (imm > 32) return v_uint8x32();
1124
1125 // ESAC control[3] ? [127:0] = 0
1126 __m256i vzero = __lasx_xvreplgr2vr_w(0);
 1127    __m256i swapz = __lasx_xvpermi_q(vzero, a.val, 0x21);
1128 if (imm == 16) return v_uint8x32(swapz);
1129 if (imm < 16) return v_uint8x32(_v256_alignr_b(swapz, a.val, imm));
1130 return v_uint8x32(__lasx_xvbsrl_v(swapz, IMM_L));
1131}
1132
1133#define OPENCV_HAL_IMPL_LASX_ROTATE_CAST(intrin, _Tpvec, cast) \
1134 template<int imm> \
1135 inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
1136 { \
1137 enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1138 v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a), \
1139 v_reinterpret_as_u8(b)); \
1140 return _Tpvec(cast(ret.val)); \
1141 } \
1142 template<int imm> \
1143 inline _Tpvec intrin(const _Tpvec& a) \
1144 { \
1145 enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1146 v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a)); \
1147 return _Tpvec(cast(ret.val)); \
1148 }
1149
1150#define OPENCV_HAL_IMPL_LASX_ROTATE(_Tpvec) \
1151 OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \
1152 OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
1153
1154OPENCV_HAL_IMPL_LASX_ROTATE(v_int8x32)
1155OPENCV_HAL_IMPL_LASX_ROTATE(v_uint16x16)
1156OPENCV_HAL_IMPL_LASX_ROTATE(v_int16x16)
1157OPENCV_HAL_IMPL_LASX_ROTATE(v_uint32x8)
1158OPENCV_HAL_IMPL_LASX_ROTATE(v_int32x8)
1159OPENCV_HAL_IMPL_LASX_ROTATE(v_uint64x4)
1160OPENCV_HAL_IMPL_LASX_ROTATE(v_int64x4)
1161
1162OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float32x8, _lasx_256_castsi256_ps)
1163OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float32x8, _lasx_256_castsi256_ps)
1164OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float64x4, _lasx_256_castsi256_pd)
1165OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float64x4, _lasx_256_castsi256_pd)
1166
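// The rotate family moves whole lanes across the 256-bit register using a
// 128-bit-half swap plus a byte-wise alignr; the single-operand forms fill the
// vacated lanes with zero. A sketch for byte lanes:
//
//   v_uint8x32 a = v256_setall_u8(7);
//   v_uint8x32 r = v_rotate_right<1>(a);   // {7, 7, ..., 7, 0}: lane i takes a[i+1],
//                                          // the top lane is zero-filled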
1167
1168inline v_uint8x32 v_reverse(const v_uint8x32 &a)
1169{
1170 static const __m256i perm = _v256_setr_b(
1171 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
1172 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1173 __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm);
1174 return v_uint8x32(__lasx_xvpermi_q(vec, vec, 1));
1175}
1176
1177inline v_int8x32 v_reverse(const v_int8x32 &a)
1178{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1179
1180inline v_uint16x16 v_reverse(const v_uint16x16 &a)
1181{
1182 __m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B);
1183 vec = __lasx_xvshuf4i_w(vec, 0x4E);
1184 return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E));
1185}
1186
1187inline v_int16x16 v_reverse(const v_int16x16 &a)
1188{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1189
1190inline v_uint32x8 v_reverse(const v_uint32x8 &a)
1191{
1192 __m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B);
1193 return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E));
1194}
1195
1196inline v_int32x8 v_reverse(const v_int32x8 &a)
1197{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1198
1199inline v_float32x8 v_reverse(const v_float32x8 &a)
1200{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1201
1202inline v_uint64x4 v_reverse(const v_uint64x4 &a)
1203{
1204 return v_uint64x4(__lasx_xvpermi_d(a.val, 0x1b));
1205}
1206
1207inline v_int64x4 v_reverse(const v_int64x4 &a)
1208{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1209
1210inline v_float64x4 v_reverse(const v_float64x4 &a)
1211{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1212
1214
 1216// this function returns a[0]+a[1]+...+a[31]
1217inline unsigned v_reduce_sum(const v_uint8x32& a)
1218{
1219 __m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val);
1220 __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
1221 __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
1222 __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
1223 return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
1224}
1225
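// The byte reduction widens pairwise (b -> h -> w -> d -> q) and finally adds the
// two 128-bit halves. Sketch:
//
//   v_uint8x32 v = v256_setall_u8(1);
//   unsigned s = v_reduce_sum(v);    // 32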
1226inline int v_reduce_sum(const v_int8x32& a)
1227{
1228 __m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val);
1229 __m256i t2 = __lasx_xvhaddw_w_h(t1, t1);
1230 __m256i t3 = __lasx_xvhaddw_d_w(t2, t2);
1231 __m256i t4 = __lasx_xvhaddw_q_d(t3, t3);
1232 return (int)(((v8i32)t4)[0]+((v8i32)t4)[4]);
1233}
1234
1235#define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \
1236 inline sctype v_reduce_##func(const _Tpvec& a) \
1237 { \
1238 __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
1239 val = intrin(val, __lsx_vbsrl_v(val,8)); \
1240 val = intrin(val, __lsx_vbsrl_v(val,4)); \
1241 val = intrin(val, __lsx_vbsrl_v(val,2)); \
1242 val = intrin(val, __lsx_vbsrl_v(val,1)); \
1243 return (sctype)__lsx_vpickve2gr_w(val, 0); \
1244 }
1245
1246OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, min, __lsx_vmin_bu)
1247OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, min, __lsx_vmin_b)
1248OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, max, __lsx_vmax_bu)
1249OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, max, __lsx_vmax_b)
1250
1251#define OPENCV_HAL_IMPL_LASX_REDUCE_16(_Tpvec, sctype, func, intrin) \
1252 inline sctype v_reduce_##func(const _Tpvec& a) \
1253 { \
1254 __m128i v0 = _v256_extract_low(a.val); \
1255 __m128i v1 = _v256_extract_high(a.val); \
1256 v0 = intrin(v0, v1); \
1257 v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \
1258 v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \
1259 v0 = intrin(v0, __lsx_vbsrl_v(v0, 2)); \
1260 return (sctype) __lsx_vpickve2gr_w(v0, 0); \
1261 }
1262
1263OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, min, __lsx_vmin_hu)
1264OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, min, __lsx_vmin_h)
1265OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, max, __lsx_vmax_hu)
1266OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, max, __lsx_vmax_h)
1267
1268#define OPENCV_HAL_IMPL_LASX_REDUCE_8(_Tpvec, sctype, func, intrin) \
1269 inline sctype v_reduce_##func(const _Tpvec& a) \
1270 { \
1271 __m128i v0 = _v256_extract_low(a.val); \
1272 __m128i v1 = _v256_extract_high(a.val); \
1273 v0 = intrin(v0, v1); \
1274 v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \
1275 v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \
1276 return (sctype) __lsx_vpickve2gr_w(v0, 0); \
1277 }
1278
1279OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, min, __lsx_vmin_wu)
1280OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, min, __lsx_vmin_w)
1281OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, max, __lsx_vmax_wu)
1282OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, max, __lsx_vmax_w)
1283
1284#define OPENCV_HAL_IMPL_LASX_REDUCE_FLT(func, intrin) \
1285 inline float v_reduce_##func(const v_float32x8& a) \
1286 { \
1287 __m128 v0 = _v256_extract_low(a.val); \
1288 __m128 v1 = _v256_extract_high(a.val); \
1289 v0 = intrin(v0, v1); \
1290 v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x0e))); \
1291 v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x01))); \
1292 float *fvalue = (float*)&v0; \
1293 return fvalue[0]; \
1294 }
1295
1296OPENCV_HAL_IMPL_LASX_REDUCE_FLT(min, __lsx_vfmin_s)
1297OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s)
1298
1299inline int v_reduce_sum(const v_int32x8& a)
1300{
1301 __m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val);
1302 __m256i t2 = __lasx_xvhaddw_q_d(t1, t1);
1303 return (int)(((v8i32)t2)[0]+((v8i32)t2)[4]);
1304}
1305
1306inline unsigned v_reduce_sum(const v_uint32x8& a)
1307{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
1308
1309inline int v_reduce_sum(const v_int16x16& a)
1310{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1311inline unsigned v_reduce_sum(const v_uint16x16& a)
1312{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1313
1314inline float v_reduce_sum(const v_float32x8& a)
1315{
1316 float result = 0;
1317 float *pa = (float*)&a;
1318 for (int i = 0; i < 2; ++i) {
1319 result += pa[i*4] + pa[i*4+1] + pa[i*4+2] + pa[i*4+3];
1320 }
1321 return result;
1322}
1323
1324inline uint64 v_reduce_sum(const v_uint64x4& a)
1325{
1326 __m256i t0 = __lasx_xvhaddw_qu_du(a.val, a.val);
1327 return (uint64)(((v4u64)t0)[0] + ((v4u64)t0)[2]);
1328}
1329inline int64 v_reduce_sum(const v_int64x4& a)
1330{
1331 __m256i t0 = __lasx_xvhaddw_q_d(a.val, a.val);
1332 return (int64)(((v4i64)t0)[0] + ((v4i64)t0)[2]);
1333}
1334inline double v_reduce_sum(const v_float64x4& a)
1335{
1336 double *pa = (double*)&a;
1337 return pa[0] + pa[1] + pa[2] + pa[3];
1338}
1339
1340inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
1341 const v_float32x8& c, const v_float32x8& d)
1342{
1343 float *pa = (float*)&a;
1344 float *pb = (float*)&b;
1345 float *pc = (float*)&c;
1346 float *pd = (float*)&d;
1347
1348 float v0 = pa[0] + pa[1] + pa[2] + pa[3];
1349 float v1 = pb[0] + pb[1] + pb[2] + pb[3];
1350 float v2 = pc[0] + pc[1] + pc[2] + pc[3];
1351 float v3 = pd[0] + pd[1] + pd[2] + pd[3];
1352 float v4 = pa[4] + pa[5] + pa[6] + pa[7];
1353 float v5 = pb[4] + pb[5] + pb[6] + pb[7];
1354 float v6 = pc[4] + pc[5] + pc[6] + pc[7];
1355 float v7 = pd[4] + pd[5] + pd[6] + pd[7];
1356 return v_float32x8(v0, v1, v2, v3, v4, v5, v6, v7);
1357}
1358
1359inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
1360{
1361 __m256i t0 = __lasx_xvabsd_bu(a.val, b.val);
1362 __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
1363 __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
1364 __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
1365 __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
1366 return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
1367}
1368inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
1369{
1370 __m256i t0 = __lasx_xvabsd_b(a.val, b.val);
1371 __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
1372 __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
1373 __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
1374 __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
1375 return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
1376}
1377inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
1378{
1379 v_uint32x8 l, h;
1380 v_expand(v_add_wrap(a - b, b - a), l, h);
1381 return v_reduce_sum(l + h);
1382}
1383inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
1384{
1385 v_uint32x8 l, h;
1386 v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
1387 return v_reduce_sum(l + h);
1388}
1389inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
1390{
1391 return v_reduce_sum(v_max(a, b) - v_min(a, b));
1392}
1393inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
1394{
1395 v_int32x8 m = a < b;
1396 return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
1397}
1398inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
1399{
1400 v_float32x8 a_b = a - b;
1401 return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
1402}
1403
1405inline v_uint8x32 v_popcount(const v_uint8x32& a)
1406{ return v_uint8x32(__lasx_xvpcnt_b(a.val)); }
1407inline v_uint16x16 v_popcount(const v_uint16x16& a)
1408{ return v_uint16x16(__lasx_xvpcnt_h(a.val)); }
1409inline v_uint32x8 v_popcount(const v_uint32x8& a)
1410{ return v_uint32x8(__lasx_xvpcnt_w(a.val)); }
1411inline v_uint64x4 v_popcount(const v_uint64x4& a)
1412{ return v_uint64x4(__lasx_xvpcnt_d(a.val)); }
1413inline v_uint8x32 v_popcount(const v_int8x32& a)
1414{ return v_popcount(v_reinterpret_as_u8(a)); }
1415inline v_uint16x16 v_popcount(const v_int16x16& a)
1416{ return v_popcount(v_reinterpret_as_u16(a)); }
1417inline v_uint32x8 v_popcount(const v_int32x8& a)
1418{ return v_popcount(v_reinterpret_as_u32(a)); }
1419inline v_uint64x4 v_popcount(const v_int64x4& a)
1420{ return v_popcount(v_reinterpret_as_u64(a)); }
1421
1422inline int v_signmask(const v_int8x32& a)
1423{
1424 __m256i result = __lasx_xvmskltz_b(a.val);
1425 int mask = __lasx_xvpickve2gr_w(result, 0);
1426 mask |= (__lasx_xvpickve2gr_w(result, 4) << 16);
1427 return mask;
1428}
1429inline int v_signmask(const v_uint8x32& a)
1430{ return v_signmask(v_reinterpret_as_s8(a)); }
1431
1432inline int v_signmask(const v_int16x16& a)
1433{ return v_signmask(v_pack(a, a)) & 0xFFFF; }
1434inline int v_signmask(const v_uint16x16& a)
1435{ return v_signmask(v_reinterpret_as_s16(a)); }
1436
1437inline int v_signmask(const v_int32x8& a)
1438{
1439 __m256i result = __lasx_xvmskltz_w(a.val);
1440 int mask = __lasx_xvpickve2gr_w(result, 0);
1441 mask |= (__lasx_xvpickve2gr_w(result, 4) << 4);
1442 return mask;
1443}
1444inline int v_signmask(const v_uint32x8& a)
1445{ return v_signmask(*(v_int32x8*)(&a)); }
1446
1447inline int v_signmask(const v_int64x4& a)
1448{
1449 __m256i result = __lasx_xvmskltz_d(a.val);
1450 int mask = __lasx_xvpickve2gr_d(result, 0);
1451 mask |= (__lasx_xvpickve2gr_w(result, 4) << 2);
1452 return mask;
1453}
1454inline int v_signmask(const v_uint64x4& a)
1455{ return v_signmask(v_reinterpret_as_s64(a)); }
1456
1457inline int v_signmask(const v_float32x8& a)
1458{ return v_signmask(*(v_int32x8*)(&a)); }
1459
1460inline int v_signmask(const v_float64x4& a)
1461{ return v_signmask(*(v_int64x4*)(&a)); }
1462
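// v_signmask packs the sign bit of every lane into an integer, lowest lane in
// bit 0; the two xvpickve2gr reads merge the per-128-bit-half masks produced by
// xvmskltz. Sketch:
//
//   v_int32x8 v(-1, 2, -3, 4, -5, 6, -7, 8);
//   int m = v_signmask(v);     // 0x55: bits 0, 2, 4, 6 set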
1463inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1464inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1465inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1466inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1467inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1468inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1469inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1470inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1471inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1472inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1473
1475#define OPENCV_HAL_IMPL_LASX_CHECK(_Tpvec, allmask) \
1476 inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
1477 inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
1478OPENCV_HAL_IMPL_LASX_CHECK(v_uint8x32, -1)
1479OPENCV_HAL_IMPL_LASX_CHECK(v_int8x32, -1)
1480OPENCV_HAL_IMPL_LASX_CHECK(v_uint32x8, 255)
1481OPENCV_HAL_IMPL_LASX_CHECK(v_int32x8, 255)
1482OPENCV_HAL_IMPL_LASX_CHECK(v_uint64x4, 15)
1483OPENCV_HAL_IMPL_LASX_CHECK(v_int64x4, 15)
1484OPENCV_HAL_IMPL_LASX_CHECK(v_float32x8, 255)
1485OPENCV_HAL_IMPL_LASX_CHECK(v_float64x4, 15)
1486
1487#define OPENCV_HAL_IMPL_LASX_CHECK_SHORT(_Tpvec) \
1488 inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
1489 inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
1490OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_uint16x16)
1491OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
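// Note: v_check_all / v_check_any inspect the per-lane sign bits, so they are intended for the
// all-ones / all-zeros masks produced by the comparison operators defined earlier in this header;
// the 16-bit variants only look at the top byte of each lane (mask 0xaaaaaaaa). For example,
// v_check_all(a == a) is true and v_check_any(a != a) is false for any integer vector a.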
1492
1493
1494
1495
1496#define OPENCV_HAL_IMPL_LASX_MULADD(_Tpvec, suffix) \
1497 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1498 { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \
1499 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1500 { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \
1501 inline _Tpvec v_sqrt(const _Tpvec& x) \
1502 { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \
1503 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1504 { return v_fma(a, a, b * b); } \
1505 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1506 { return v_sqrt(v_fma(a, a, b*b)); }
1507
1508OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
1509OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
1510
1511inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1512{
1513 return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val));
1514}
1515
1516inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1517{
1518 return v_fma(a, b, c);
1519}
1520
1521inline v_float32x8 v_invsqrt(const v_float32x8& x)
1522{ return v_float32x8(__lasx_xvfrsqrt_s(x.val)); }
1523
1524inline v_float64x4 v_invsqrt(const v_float64x4& x)
1525{ return v_float64x4(__lasx_xvfrsqrt_d(x.val)); }
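// Usage sketch (illustrative, given v_float32x8 x and y): v_fma / v_muladd compute a*b + c
// (fused for the float types), while v_sqr_magnitude and v_magnitude build on them:
//   v_float32x8 s = v_fma(x, x, y * y);   // equals v_sqr_magnitude(x, y), i.e. x*x + y*y
//   v_float32x8 d = v_sqrt(s);            // equals v_magnitude(x, y)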
1526
1528#define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix) \
1529 inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
1530 { return v_u##_Tpvec(__lasx_xvabsd_##suffix(x.val, __lasx_xvreplgr2vr_w(0))); }
1531
1532OPENCV_HAL_IMPL_LASX_ABS(int8x32, b)
1533OPENCV_HAL_IMPL_LASX_ABS(int16x16, h)
1534OPENCV_HAL_IMPL_LASX_ABS(int32x8, w)
1535
1536inline v_float32x8 v_abs(const v_float32x8& x)
1537{ return v_float32x8(*((__m256i*)&x) & __lasx_xvreplgr2vr_w(0x7fffffff)); }
1538inline v_float64x4 v_abs(const v_float64x4& x)
1539{ return v_float64x4(*((__m256i*)&x) & __lasx_xvreplgr2vr_d(0x7fffffffffffffff)); }
1540
1542inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
1543{ return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); }
1544inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
1545{ return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); }
1546inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
1547{ return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); }
1548
1549inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
1550{ return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); }
1551inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
1552{ return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); }
1553inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
1554{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
1555
1556inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
1557{ return v_abs(a - b); }
1558
1559inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
1560{ return v_abs(a - b); }
1561
1563inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
1564{
1565 v_int8x32 d = a - b;
1566 v_int8x32 m = a < b;
1567 return (d ^ m) - m;
1568}
1569inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
1570{ return v_max(a, b) - v_min(a, b); }
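// Note: v_absdiff returns |a - b| in the corresponding unsigned type, while v_absdiffs keeps
// the signed element type. Illustrative, assuming v256_setall_s8 is available:
//   v_uint8x32 d  = v_absdiff (v256_setall_s8(-3), v256_setall_s8(4));   // every lane == 7
//   v_int8x32  ds = v_absdiffs(v256_setall_s8(-3), v256_setall_s8(4));   // every lane == 7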
1571
1573
1575inline v_int32x8 v_round(const v_float32x8& a)
1576{ return v_int32x8(__lasx_xvftint_w_s(a.val)); }
1577
1578inline v_int32x8 v_round(const v_float64x4& a)
1579{ __m256i t = __lasx_xvftint_w_d(a.val, a.val);
1580 return v_int32x8(__lasx_xvpermi_d(t, 0x88)); }
1581
1582inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
1583{
1584 __m256i abi = __lasx_xvftint_w_d(b.val, a.val);
1585    return v_int32x8(__lasx_xvpermi_d(abi, 0b11011000)); // imm 0b11011000 selects 64-bit lanes 0,2,1,3 ("3120" read high-to-low)
1586}
1587
1588inline v_int32x8 v_trunc(const v_float32x8& a)
1589{ return v_int32x8(__lasx_xvftintrz_w_s(a.val)); }
1590
1591inline v_int32x8 v_trunc(const v_float64x4& a)
1592{ __m256i t = __lasx_xvftintrz_w_d(a.val, a.val);
1593 return v_int32x8(__lasx_xvpermi_d(t, 0x88)); }
1594
1595inline v_int32x8 v_floor(const v_float32x8& a)
1596{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrm_s(a.val)))); }
1597
1598inline v_int32x8 v_floor(const v_float64x4& a)
1599{ return v_trunc(v_float64x4(__lasx_xvfrintrm_d(a.val))); }
1600
1601inline v_int32x8 v_ceil(const v_float32x8& a)
1602{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrp_s(a.val)))); }
1603
1604inline v_int32x8 v_ceil(const v_float64x4& a)
1605{ return v_trunc(v_float64x4(__lasx_xvfrintrp_d(a.val))); }
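// Note: v_round, v_trunc, v_floor and v_ceil convert to int32 with the respective rounding
// rule; for v_float64x4 inputs the four results occupy the lower four lanes of the v_int32x8
// and the upper lanes should not be relied upon. Illustrative, assuming v256_setall_f32 exists:
//   v_int32x8 f = v_floor(v256_setall_f32(2.7f));   // every lane == 2
//   v_int32x8 c = v_ceil (v256_setall_f32(2.2f));   // every lane == 3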
1606
1608inline v_float32x8 v_cvt_f32(const v_int32x8& a)
1609{ return v_float32x8(__lasx_xvffint_s_w(a.val)); }
1610
1611inline v_float32x8 v_cvt_f32(const v_float64x4& a)
1612{ return v_float32x8(__lasx_xvpermi_d(__lasx_xvfcvt_s_d(a.val, a.val), 0x88)); }
1613
1614inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
1615{
1616    __m256 abf = __lasx_xvfcvt_s_d(a.val, b.val); // warning: the order of a and b differs from the xvfcvt.s.d instruction's operand order
1617 return v_float32x8(__lasx_xvpermi_d(abf, 0x8D));
1618}
1619
1620inline v_float64x4 v_cvt_f64(const v_int32x8& a)
1621{
1622 __m256i alow = __lasx_xvpermi_d(a.val, 0x10);
1623 return v_float64x4(__lasx_xvffintl_d_w(alow));
1624}
1625
1626inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
1627{
1628 __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32);
1629 return v_float64x4(__lasx_xvffintl_d_w(ahigh));
1630}
1631
1632inline v_float64x4 v_cvt_f64(const v_float32x8& a)
1633{
1634 __m256i alow = __lasx_xvpermi_d(a.val, 0x10);
1635 return v_float64x4(__lasx_xvfcvtl_d_s((__m256)alow));
1636}
1637
1638inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
1639{
1640 __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32);
1641 return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh));
1642}
1643
1644inline v_float64x4 v_cvt_f64(const v_int64x4& v)
1645{ return v_float64x4(__lasx_xvffint_d_l(v.val)); }
1646
1648
1649inline v_int8x32 v256_lut(const schar* tab, const int* idx)
1650{
1651 return v_int8x32(_v256_setr_b(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]],
1652 tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]],
1653 tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]], tab[idx[16]], tab[idx[17]],
1654 tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
1655 tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]],
1656 tab[idx[30]], tab[idx[31]]));
1657}
1658inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
1659{
1660 return v_int8x32(_v256_setr_h(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]),
1661 *(const short*)(tab + idx[ 3]), *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]),
1662 *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]), *(const short*)(tab + idx[ 8]),
1663 *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
1664 *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]),
1665 *(const short*)(tab + idx[15])));
1666}
1667inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
1668{
1669 return v_int8x32(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1670 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
1671 *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
1672 *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7])));
1673}
1674inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
1675inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
1676inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
1677
1678inline v_int16x16 v256_lut(const short* tab, const int* idx)
1679{
1680 return v_int16x16(_v256_setr_h(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]],
1681 tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]],
1682 tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]],
1683 tab[idx[15]]));
1684}
1685inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
1686{
1687 return v_int16x16(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1688 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
1689 *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
1690 *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) ));
1691}
1692inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
1693{
1694 return v_int16x16(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
1695 *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
1696
1697}
1698inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
1699inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
1700inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
1701
1702inline v_int32x8 v256_lut(const int* tab, const int* idx)
1703{
1704 return v_int32x8(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1705 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
1706 *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
1707 *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) ));
1708}
1709inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
1710{
1711 return v_int32x8(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
1712 *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
1713}
1714inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
1715{
1716 return v_int32x8(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0)));
1717}
1718inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
1719inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
1720inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
1721
1722inline v_int64x4 v256_lut(const int64* tab, const int* idx)
1723{
1724 return v_int64x4(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
1725 *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
1726}
1727inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
1728{
1729 return v_int64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0)));
1730}
1731inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
1732inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
1733
1734inline v_float32x8 v256_lut(const float* tab, const int* idx)
1735{
1736 return v_float32x8(_v256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1737 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1738}
1739inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
1740inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }
1741
1742inline v_float64x4 v256_lut(const double* tab, const int* idx)
1743{
1744 return v_float64x4(_v256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
1745}
1746inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx)
1747{ return v_float64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); }
1748
1749inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
1750{
1751 int *idx = (int*)&idxvec.val;
1752 return v256_lut(tab, idx);
1753}
1754
1755inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
1756{
1757 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1758}
1759
1760inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
1761{
1762 const int *idx = (const int*)&idxvec.val;
1763 return v256_lut(tab, idx);
1764}
1765
1766inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
1767{
1768 const int *idx = (const int*)&idxvec.val;
1769 return v256_lut(tab, idx);
1770}
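// Note: the v256_lut* helpers gather table elements at the given indices: v256_lut reads one
// element per index, v256_lut_pairs two consecutive elements and v256_lut_quads four; the
// v_lut overloads take the indices from a v_int32x8 instead of an int array. Illustrative:
//   given int tab[64] and int idx[8] = {0, 8, 16, 24, 32, 40, 48, 56},
//   v256_lut(tab, idx) yields { tab[0], tab[8], ..., tab[56] }.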
1771
1772inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
1773{
1774 const int *idx = (const int*)&idxvec.val;
1775 __m128i xy01, xy45, xy23, xy67;
1776 xy01 = __lsx_vld(tab + idx[0], 0);
1777 xy01 = __lsx_vextrins_d(xy01, __lsx_vld(tab + idx[1], 0), 0x10);
1778 xy45 = __lsx_vld(tab + idx[4], 0);
1779 xy45 = __lsx_vextrins_d(xy45, __lsx_vld(tab + idx[5], 0), 0x10);
1780 __m256i xy0145 = _v256_combine(xy01, xy45);
1781 xy23 = __lsx_vld(tab + idx[2], 0);
1782 xy23 = __lsx_vextrins_d(xy23, __lsx_vld(tab + idx[3], 0), 0x10);
1783 xy67 = __lsx_vld(tab + idx[6], 0);
1784 xy67 = __lsx_vextrins_d(xy67, __lsx_vld(tab + idx[7], 0), 0x10);
1785 __m256i xy2367 = _v256_combine(xy23, xy67);
1786
1787 __m256i xxyy0145 = __lasx_xvilvl_w(xy2367, xy0145);
1788 __m256i xxyy2367 = __lasx_xvilvh_w(xy2367, xy0145);
1789
1790 x = v_float32x8(__lasx_xvilvl_w(xxyy2367, xxyy0145));
1791 y = v_float32x8(__lasx_xvilvh_w(xxyy2367, xxyy0145));
1792}
1793
1794inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
1795{
1796 //int CV_DECL_ALIGNED(32) idx[4];
1797 const int *idx = (const int*)&idxvec.val;
1798 __m128i xy0 = __lsx_vld(tab + idx[0], 0);
1799 __m128i xy2 = __lsx_vld(tab + idx[2], 0);
1800 __m128i xy1 = __lsx_vld(tab + idx[1], 0);
1801 __m128i xy3 = __lsx_vld(tab + idx[3], 0);
1802 __m256i xy02 = _v256_combine(xy0, xy2);
1803 __m256i xy13 = _v256_combine(xy1, xy3);
1804
1805 x = v_float64x4(__lasx_xvilvl_d(xy13, xy02));
1806 y = v_float64x4(__lasx_xvilvh_d(xy13, xy02));
1807}
1808
1809inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
1810{
1811 return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
1812 _v256_set_d(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
1813}
1814inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec)
1815{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1816inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
1817{
1818 return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
1819 _v256_set_d(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
1820}
1821inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec)
1822{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1823
1824inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
1825{
1826 return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
1827 _v256_set_d(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
1828}
1829inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec)
1830{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1831inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
1832{
1833 return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
1834 _v256_set_d(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
1835}
1836inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec)
1837{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1838
1839inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
1840{
1841 return v_int32x8(__lasx_xvshuf4i_w(vec.val, 0xd8));
1842}
1843inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec)
1844{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1845inline v_float32x8 v_interleave_pairs(const v_float32x8& vec)
1846{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1847
1848inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
1849{
1850 __m256i vzero = __lasx_xvreplgr2vr_w(0);
1851 __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
1852 _v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100));
1853 return v_int8x32(__lasx_xvperm_w(t1,
1854 _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1855}
1856inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
1857{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1858
1859inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
1860{
1861 __m256i vzero = __lasx_xvreplgr2vr_w(0);
1862 __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
1863 _v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100));
1864 return v_int16x16(__lasx_xvperm_w(t1,
1865 _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1866}
1867inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec)
1868{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1869
1870inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
1871{
1872 return v_int32x8(__lasx_xvperm_w(vec.val,
1873 _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1874}
1875inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec)
1876{ return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1877inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
1878{
1879 return v_float32x8(__lasx_xvperm_w(*(__m256i*)(&vec.val),
1880 _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1881}
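// Note: v_pack_triplets drops every fourth lane and packs the remaining triplets together,
// i.e. {x0,y0,z0,w0, x1,y1,z1,w1, ...} becomes {x0,y0,z0, x1,y1,z1, ...}; the contents of the
// trailing lanes are unspecified.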
1882
1884
1886
1887// 16 >> 32
1888inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
1889{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
1890
1891inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1892{ return v_dotprod(a, b) + c; }
1893
1894// 32 >> 64
1895inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
1896{
1897 __m256i even = __lasx_xvmulwev_d_w(a.val, b.val);
1898 return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
1899}
1900inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1901{
1902 __m256i even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val);
1903 return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
1904}
1905
1906// 8 >> 32
1907inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
1908{
1909 __m256i even = __lasx_xvmulwev_h_bu(a.val, b.val);
1910 __m256i odd = __lasx_xvmulwod_h_bu(a.val, b.val);
1911 __m256i prod0 = __lasx_xvhaddw_wu_hu(even, even);
1912 __m256i prod1 = __lasx_xvhaddw_wu_hu(odd, odd);
1913 return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
1914}
1915inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1916{ return v_dotprod_expand(a, b) + c; }
1917
1918inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
1919{
1920 __m256i even = __lasx_xvmulwev_h_b(a.val, b.val);
1921 __m256i odd = __lasx_xvmulwod_h_b(a.val, b.val);
1922 __m256i prod0 = __lasx_xvhaddw_w_h(even, even);
1923 __m256i prod1 = __lasx_xvhaddw_w_h(odd, odd);
1924 return v_int32x8(__lasx_xvadd_w(prod0, prod1));
1925}
1926inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1927{ return v_dotprod_expand(a, b) + c; }
1928
1929// 16 >> 64
1930inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
1931{
1932 __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
1933 __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
1934 __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
1935 __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
1936 return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
1937}
1938inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1939{ return v_dotprod_expand(a, b) + c; }
1940
1941inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
1942{
1943 __m256i even = __lasx_xvmulwev_w_h(a.val, b.val);
1944 __m256i odd = __lasx_xvmulwod_w_h(a.val, b.val);
1945 __m256i prod0 = __lasx_xvhaddw_d_w(even, even);
1946 __m256i prod1 = __lasx_xvhaddw_d_w(odd, odd);
1947 return v_int64x4(__lasx_xvadd_d(prod0, prod1));
1948}
1949
1950inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1951{ return v_dotprod_expand(a, b) + c; }
1952
1953// 32 >> 64f
1954inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
1955{ return v_cvt_f64(v_dotprod(a, b)); }
1956inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1957{ return v_dotprod_expand(a, b) + c; }
1958
1960
1961// 16 >> 32
1962inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
1963{ return v_dotprod(a, b); }
1964inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1965{ return v_dotprod(a, b, c); }
1966
1967// 32 >> 64
1968inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
1969{ return v_dotprod(a, b); }
1970inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1971{ return v_dotprod(a, b, c); }
1972
1973// 8 >> 32
1974inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
1975{ return v_dotprod_expand(a, b); }
1976inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1977{ return v_dotprod_expand(a, b, c); }
1978
1979inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
1980{ return v_dotprod_expand(a, b); }
1981inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1982{ return v_dotprod_expand(a, b, c); }
1983
1984// 16 >> 64
1985inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
1986{
1987 __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
1988 __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
1989 __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
1990 __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
1991 return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
1992}
1993inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1994{ return v_dotprod_expand_fast(a, b) + c; }
1995
1996inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
1997{
1998 __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val));
1999 __m256i sign = __lasx_xvsrai_w(prod, 31);
2000 __m256i lo = __lasx_xvilvl_w(sign, prod);
2001 __m256i hi = __lasx_xvilvh_w(sign, prod);
2002 return v_int64x4(__lasx_xvadd_d(lo, hi));
2003}
2004inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
2005{ return v_dotprod_expand_fast(a, b) + c; }
2006
2007// 32 >> 64f
2008inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
2009{ return v_dotprod_expand(a, b); }
2010inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
2011{ return v_dotprod_expand(a, b, c); }
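// Usage sketch (illustrative; v256_setall_s16 assumed): v_dotprod multiplies adjacent lane
// pairs and sums them into the wider type, v_dotprod_expand sums groups of four, and the
// *_fast variants produce the same totals but may order the partial sums differently:
//   v_int32x8 s = v_dotprod(v256_setall_s16(2), v256_setall_s16(3));   // every lane == 2*3 + 2*3 == 12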
2012
2013
2014#define OPENCV_HAL_LASX_SPLAT2_PS(a, im) \
2015 v_float32x8(__lasx_xvpermi_w(a.val, a.val, im))
2016
2017inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
2018 const v_float32x8& m1, const v_float32x8& m2,
2019 const v_float32x8& m3)
2020{
2021 v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);
2022 v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
2023 v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
2024 v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
2025 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
2026}
2027
2028inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
2029 const v_float32x8& m1, const v_float32x8& m2,
2030 const v_float32x8& a)
2031{
2032 v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);
2033 v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
2034 v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
2035 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
2036}
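// Note: v_matmul treats each 128-bit half of 'v' as an independent 4-component vector and
// combines it with m0..m3:  result = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3 per half;
// v_matmuladd replaces the v[3]*m3 term with the additive vector 'a'.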
2037
2038
2039#define OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(_Tpvec, cast_from, cast_to) \
2040 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
2041 const _Tpvec& a2, const _Tpvec& a3, \
2042 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
2043 { \
2044 __m256i t0 = cast_from(__lasx_xvilvl_w(a1.val, a0.val)); \
2045 __m256i t1 = cast_from(__lasx_xvilvl_w(a3.val, a2.val)); \
2046 __m256i t2 = cast_from(__lasx_xvilvh_w(a1.val, a0.val)); \
2047 __m256i t3 = cast_from(__lasx_xvilvh_w(a3.val, a2.val)); \
2048 b0.val = cast_to(__lasx_xvilvl_d(t1, t0)); \
2049 b1.val = cast_to(__lasx_xvilvh_d(t1, t0)); \
2050 b2.val = cast_to(__lasx_xvilvl_d(t3, t2)); \
2051 b3.val = cast_to(__lasx_xvilvh_d(t3, t2)); \
2052 }
2053
2054OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_uint32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2055OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_int32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2056
2057inline void v_transpose4x4(const v_float32x8 &a0, const v_float32x8 &a1,
2058 const v_float32x8 &a2, const v_float32x8 &a3,
2059 v_float32x8 &b0, v_float32x8 &b1, v_float32x8 &b2, v_float32x8 &b3)
2060{
2061 __m256i t0 = __lasx_xvilvl_w(__m256i(a1.val), __m256i(a0.val));
2062 __m256i t1 = __lasx_xvilvl_w(__m256i(a3.val), __m256i(a2.val));
2063 __m256i t2 = __lasx_xvilvh_w(__m256i(a1.val), __m256i(a0.val));
2064 __m256i t3 = __lasx_xvilvh_w(__m256i(a3.val), __m256i(a2.val));
2065 b0.val = __m256(__lasx_xvilvl_d(t1, t0));
2066 b1.val = __m256(__lasx_xvilvh_d(t1, t0));
2067 b2.val = __m256(__lasx_xvilvl_d(t3, t2));
2068 b3.val = __m256(__lasx_xvilvh_d(t3, t2));
2069}
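// Note: v_transpose4x4 performs two independent 4x4 transposes of 32-bit lanes, one on the
// low 128-bit halves of a0..a3 and one on the high halves.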
2070
2072
2073/* Expand */
2074#define OPENCV_HAL_IMPL_LASX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
2075 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
2076 { \
2077 b0.val = intrin(a.val); \
2078 b1.val = intrin(__lasx_xvpermi_q(a.val, a.val, 0x11)); \
2079 } \
2080 inline _Tpwvec v_expand_low(const _Tpvec& a) \
2081 { return _Tpwvec(intrin(a.val)); } \
2082 inline _Tpwvec v_expand_high(const _Tpvec& a) \
2083 { return _Tpwvec(intrin(__lasx_xvpermi_q(a.val, a.val, 0x11))); } \
2084 inline _Tpwvec v256_load_expand(const _Tp* ptr) \
2085 { \
2086 __m128i a = __lsx_vld(ptr, 0); \
2087 return _Tpwvec(intrin(*((__m256i*)&a))); \
2088 }
2089
2090OPENCV_HAL_IMPL_LASX_EXPAND(v_uint8x32, v_uint16x16, uchar, __lasx_vext2xv_hu_bu)
2091OPENCV_HAL_IMPL_LASX_EXPAND(v_int8x32, v_int16x16, schar, __lasx_vext2xv_h_b)
2092OPENCV_HAL_IMPL_LASX_EXPAND(v_uint16x16, v_uint32x8, ushort, __lasx_vext2xv_wu_hu)
2093OPENCV_HAL_IMPL_LASX_EXPAND(v_int16x16, v_int32x8, short, __lasx_vext2xv_w_h)
2094OPENCV_HAL_IMPL_LASX_EXPAND(v_uint32x8, v_uint64x4, unsigned, __lasx_vext2xv_du_wu)
2095OPENCV_HAL_IMPL_LASX_EXPAND(v_int32x8, v_int64x4, int, __lasx_vext2xv_d_w)
2096
2097#define OPENCV_HAL_IMPL_LASX_EXPAND_Q(_Tpvec, _Tp, intrin) \
2098 inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
2099 { \
2100 __m128i a = __lsx_vld(ptr, 0); \
2101 return _Tpvec(intrin(*((__m256i*)&a))); \
2102 }
2103
2104OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_uint32x8, uchar, __lasx_vext2xv_wu_bu)
2105OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_int32x8, schar, __lasx_vext2xv_w_b)
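// Usage sketch (illustrative; v256_setall_u8 assumed): v_expand widens every lane to twice
// its width, v_expand_low / v_expand_high widen only one half, and v256_load_expand loads
// 128 bits from memory and widens them into a full 256-bit register:
//   v_uint16x16 lo, hi;
//   v_expand(v256_setall_u8(200), lo, hi);   // every 16-bit lane of lo and hi holds 200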
2106
2107/* pack */
2108// 16
2109inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
2110{ return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); }
2111
2112inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
2113{ return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); }
2114
2115inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
2116{
2117 return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a.val, b.val)));
2118}
2119
2120inline void v_pack_store(schar* ptr, const v_int16x16& a)
2121{ v_store_low(ptr, v_pack(a, a)); }
2122
2123inline void v_pack_store(uchar *ptr, const v_uint16x16& a)
2124{ v_store_low(ptr, v_pack(a, a)); }
2125
2126inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
2127{ v_store_low(ptr, v_pack_u(a, a)); }
2128
2129template<int n> inline
2130v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
2131{
2132 __m256i res = __lasx_xvssrlrni_bu_h(b.val, a.val, n);
2133 return v_uint8x32(_v256_shuffle_odd_64(res));
2134}
2135
2136template<int n> inline
2137void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
2138{
2139 __m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
2140 __lasx_xvstelm_d(res, ptr, 0, 0);
2141 __lasx_xvstelm_d(res, ptr, 8, 2);
2142}
2143
2144template<int n> inline
2145v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
2146{
2147 __m256i res = __lasx_xvssrarni_bu_h(b.val, a.val, n);
2148 return v_uint8x32(_v256_shuffle_odd_64(res));
2149}
2150
2151template<int n> inline
2152void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
2153{
2154 __m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n);
2155 __lasx_xvstelm_d(res, ptr, 0, 0);
2156 __lasx_xvstelm_d(res, ptr, 8, 2);
2157}
2158
2159template<int n> inline
2160v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
2161{
2162 __m256i res = __lasx_xvssrarni_b_h(b.val, a.val, n);
2163 return v_int8x32(_v256_shuffle_odd_64(res));
2164}
2165
2166template<int n> inline
2167void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
2168{
2169 __m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n);
2170 __lasx_xvstelm_d(res, ptr, 0, 0);
2171 __lasx_xvstelm_d(res, ptr, 8, 2);
2172}
2173
2174// 32
2175inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
2176{ return v_int16x16(_v256_shuffle_odd_64(_lasx_packs_w(a.val, b.val))); }
2177
2178inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
2179{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
2180
2181inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
2182{ return v_uint16x16(_v256_shuffle_odd_64(_lasx_packus_w(a.val, b.val))); }
2183
2184inline void v_pack_store(short* ptr, const v_int32x8& a)
2185{ v_store_low(ptr, v_pack(a, a)); }
2186
2187inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
2188{
2189 __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0);
2190 __lasx_xvstelm_d(res, ptr, 0, 0);
2191 __lasx_xvstelm_d(res, ptr, 8, 2);
2192}
2193
2194inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
2195{ v_store_low(ptr, v_pack_u(a, a)); }
2196
2197template<int n> inline
2198v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
2199{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); }
2200
2201template<int n> inline
2202void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
2203{
2204 __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
2205 __lasx_xvstelm_d(res, ptr, 0, 0);
2206 __lasx_xvstelm_d(res, ptr, 8, 2);
2207}
2208
2209template<int n> inline
2210v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
2211{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); }
2212
2213template<int n> inline
2214void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
2215{
2216 __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
2217 __lasx_xvstelm_d(res, ptr, 0, 0);
2218 __lasx_xvstelm_d(res, ptr, 8, 2);
2219}
2220
2221template<int n> inline
2222v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
2223{ return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); }
2224
2225template<int n> inline
2226void v_rshr_pack_store(short* ptr, const v_int32x8& a)
2227{
2228 __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
2229 __lasx_xvstelm_d(res, ptr, 0, 0);
2230 __lasx_xvstelm_d(res, ptr, 8, 2);
2231}
2232
2233// 64
2234// Non-saturating pack
2235inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
2236{
2237 __m256i ab = __lasx_xvpickev_w(b.val, a.val);
2238 return v_uint32x8(_v256_shuffle_odd_64(ab));
2239}
2240
2241inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
2242{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2243
2244inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
2245{
2246 __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
2247 v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
2248}
2249
2250inline void v_pack_store(int* ptr, const v_int64x4& b)
2251{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2252
2253template<int n> inline
2254v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
2255{ return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); }
2256
2257template<int n> inline
2258void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
2259{
2260 __m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n);
2261 __lasx_xvstelm_d(res, ptr, 0, 0);
2262 __lasx_xvstelm_d(res, ptr, 8, 2);
2263}
2264
2265template<int n> inline
2266v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
2267{ return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); }
2268
2269template<int n> inline
2270void v_rshr_pack_store(int* ptr, const v_int64x4& a)
2271{
2272 __m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n);
2273 __lasx_xvstelm_d(res, ptr, 0, 0);
2274 __lasx_xvstelm_d(res, ptr, 8, 2);
2275}
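// Usage sketch (illustrative; v256_setall_s32 assumed): v_pack narrows two vectors into one
// (saturating for the 8/16/32-bit element packs; the 64-bit pack below is non-saturating),
// v_pack_u saturates signed input into the unsigned type, and v_rshr_pack<n> applies a
// rounding right shift by n bits before the saturating narrow:
//   v_int16x16 p = v_pack(v256_setall_s32(70000), v256_setall_s32(-70000));
//   // lanes taken from the first argument saturate to 32767, from the second to -32768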
2276
2277// pack boolean
2278inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
2279{
2280 __m256i ab = _lasx_packs_h(a.val, b.val);
2281 return v_uint8x32(_v256_shuffle_odd_64(ab));
2282}
2283
2284inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
2285 const v_uint32x8& c, const v_uint32x8& d)
2286{
2287 __m256i ab = _lasx_packs_w(a.val, b.val);
2288 __m256i cd = _lasx_packs_w(c.val, d.val);
2289
2290 __m256i abcd = _v256_shuffle_odd_64(_lasx_packs_h(ab, cd));
2291 return v_uint8x32(__lasx_xvshuf4i_w(abcd, 0xd8));
2292}
2293
2294inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2295 const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
2296 const v_uint64x4& g, const v_uint64x4& h)
2297{
2298 __m256i ab = _lasx_packs_w(a.val, b.val);
2299 __m256i cd = _lasx_packs_w(c.val, d.val);
2300 __m256i ef = _lasx_packs_w(e.val, f.val);
2301 __m256i gh = _lasx_packs_w(g.val, h.val);
2302
2303 __m256i abcd = _lasx_packs_w(ab, cd);
2304 __m256i efgh = _lasx_packs_w(ef, gh);
2305 __m256i pkall = _v256_shuffle_odd_64(_lasx_packs_h(abcd, efgh));
2306
2307 __m256i rev = _v256_alignr_b(pkall, pkall, 8);
2308 return v_uint8x32(__lasx_xvilvl_h(rev, pkall));
2309}
2310
2311/* Recombine */
2312// it's up there with the load and store operations
2313
2314/* Extract */
2315#define OPENCV_HAL_IMPL_LASX_EXTRACT(_Tpvec) \
2316 template<int s> \
2317 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2318 { return v_rotate_right<s>(a, b); }
2319
2320OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint8x32)
2321OPENCV_HAL_IMPL_LASX_EXTRACT(v_int8x32)
2322OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint16x16)
2323OPENCV_HAL_IMPL_LASX_EXTRACT(v_int16x16)
2324OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint32x8)
2325OPENCV_HAL_IMPL_LASX_EXTRACT(v_int32x8)
2326OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint64x4)
2327OPENCV_HAL_IMPL_LASX_EXTRACT(v_int64x4)
2328OPENCV_HAL_IMPL_LASX_EXTRACT(v_float32x8)
2329OPENCV_HAL_IMPL_LASX_EXTRACT(v_float64x4)
2330
2331template<int i>
2332inline uchar v_extract_n(v_uint8x32 a)
2333{
2334 return (uchar)_v256_extract_b<i>(a.val);
2335}
2336
2337template<int i>
2338inline schar v_extract_n(v_int8x32 a)
2339{
2340 return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
2341}
2342
2343template<int i>
2344inline ushort v_extract_n(v_uint16x16 a)
2345{
2346 return (ushort)_v256_extract_h<i>(a.val);
2347}
2348
2349template<int i>
2350inline short v_extract_n(v_int16x16 a)
2351{
2352 return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
2353}
2354
2355template<int i>
2356inline uint v_extract_n(v_uint32x8 a)
2357{
2358 return (uint)_v256_extract_w<i>(a.val);
2359}
2360
2361template<int i>
2362inline int v_extract_n(v_int32x8 a)
2363{
2364 return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
2365}
2366
2367template<int i>
2368inline uint64 v_extract_n(v_uint64x4 a)
2369{
2370 return (uint64)_v256_extract_d<i>(a.val);
2371}
2372
2373template<int i>
2374inline int64 v_extract_n(v_int64x4 v)
2375{
2376 return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
2377}
2378
2379template<int i>
2380inline float v_extract_n(v_float32x8 v)
2381{
2382 union { uint iv; float fv; } d;
2383 d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
2384 return d.fv;
2385}
2386
2387template<int i>
2388inline double v_extract_n(v_float64x4 v)
2389{
2390 union { uint64 iv; double dv; } d;
2391 d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
2392 return d.dv;
2393}
2394
2395template<int i>
2396inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
2397{
2398 static const __m256i perm = __lasx_xvreplgr2vr_w((char)i);
2399 return v_uint32x8(__lasx_xvperm_w(a.val, perm));
2400}
2401
2402template<int i>
2403inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
2404{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2405
2406template<int i>
2407inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
2408{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
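// Note: v_extract<s>(a, b) behaves like v_rotate_right<s>(a, b) on the concatenated lanes,
// v_extract_n<i>(a) returns lane i as a scalar, and v_broadcast_element<i>(a) copies lane i
// into every lane of the result.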
2409
2411
2412inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b)
2413{
2414 __m256i t0 = __lasx_xvld(ptr, 0);
2415 __m256i t1 = __lasx_xvld(ptr, 32);
2416
2417 __m256i p0 = __lasx_xvpickev_b(t1, t0);
2418 __m256i p1 = __lasx_xvpickod_b(t1, t0);
2419
2420 a.val = __lasx_xvpermi_d(p0, 0xd8);
2421 b.val = __lasx_xvpermi_d(p1, 0xd8);
2422}
2423
2424inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
2425{
2426 __m256i t0 = __lasx_xvld(ptr, 0);
2427 __m256i t1 = __lasx_xvld(ptr, 32);
2428
2429 __m256i p0 = __lasx_xvpickev_h(t1, t0);
2430 __m256i p1 = __lasx_xvpickod_h(t1, t0);
2431
2432 a.val = __lasx_xvpermi_d(p0, 0xd8);
2433 b.val = __lasx_xvpermi_d(p1, 0xd8);
2434}
2435
2436inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
2437{
2438 __m256i t0 = __lasx_xvld(ptr, 0);
2439 __m256i t1 = __lasx_xvld(ptr, 32);
2440
2441 __m256i p0 = __lasx_xvpickev_w(t1, t0);
2442 __m256i p1 = __lasx_xvpickod_w(t1, t0);
2443
2444 a.val = __lasx_xvpermi_d(p0, 0xd8);
2445 b.val = __lasx_xvpermi_d(p1, 0xd8);
2446}
2447
2448inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
2449{
2450 __m256i ab0 = __lasx_xvld(ptr, 0);
2451 __m256i ab1 = __lasx_xvld(ptr, 32);
2452
2453 __m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02);
2454 __m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13);
2455 __m256i a0 = __lasx_xvilvl_d(ph, pl);
2456 __m256i b0 = __lasx_xvilvh_d(ph, pl);
2457 a = v_uint64x4(a0);
2458 b = v_uint64x4(b0);
2459}
2460
2461inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
2462{
2463 __m256i bgr0 = __lasx_xvld(ptr, 0);
2464 __m256i bgr1 = __lasx_xvld(ptr, 32);
2465 __m256i bgr2 = __lasx_xvld(ptr, 64);
2466
2467 __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
2468 __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
2469
2470 const __m256i m0 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2471 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2472 const __m256i m1 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2473 -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
2474
2475 __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1);
2476 __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
2477 __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);
2478
2479 const __m256i
2480 sh_b = _v256_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
2481 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
2482 sh_g = _v256_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
2483 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
2484 sh_r = _v256_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2485 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2486 b0 = __lasx_xvshuf_b(b0, b0, sh_b);
2487 g0 = __lasx_xvshuf_b(g0, g0, sh_g);
2488 r0 = __lasx_xvshuf_b(r0, r0, sh_r);
2489
2490 a = v_uint8x32(b0);
2491 b = v_uint8x32(g0);
2492 c = v_uint8x32(r0);
2493}
2494
2495inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
2496{
2497 __m256i bgr0 = __lasx_xvld(ptr, 0);
2498 __m256i bgr1 = __lasx_xvld(ptr, 32);
2499 __m256i bgr2 = __lasx_xvld(ptr, 64);
2500
2501 __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
2502 __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
2503
2504 const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2505 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2506 const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2507 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2508 __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1);
2509 __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);
2510 __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
2511 const __m256i sh_b = _v256_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2512 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2513 const __m256i sh_g = _v256_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2514 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2515 const __m256i sh_r = _v256_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2516 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2517 b0 = __lasx_xvshuf_b(b0, b0, sh_b);
2518 g0 = __lasx_xvshuf_b(g0, g0, sh_g);
2519 r0 = __lasx_xvshuf_b(r0, r0, sh_r);
2520
2521 a = v_uint16x16(b0);
2522 b = v_uint16x16(g0);
2523 c = v_uint16x16(r0);
2524}
2525
2526inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
2527{
2528 __m256i bgr0 = __lasx_xvld(ptr, 0);
2529 __m256i bgr1 = __lasx_xvld(ptr, 32);
2530 __m256i bgr2 = __lasx_xvld(ptr, 64);
2531
2532 __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
2533 __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
2534
2535 __m256i m24 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);
2536 __m256i m92 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0);
2537 __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m24), bgr1, m92);
2538 __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m92), bgr1, m24);
2539 __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m24), s02_high, m92);
2540
2541 b0 = __lasx_xvshuf4i_w(b0, 0x6c);
2542 g0 = __lasx_xvshuf4i_w(g0, 0xb1);
2543 r0 = __lasx_xvshuf4i_w(r0, 0xc6);
2544
2545 a = v_uint32x8(b0);
2546 b = v_uint32x8(g0);
2547 c = v_uint32x8(r0);
2548}
2549
2550inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
2551{
2552 __m256i bgr0 = __lasx_xvld(ptr, 0);
2553 __m256i bgr1 = __lasx_xvld(ptr, 32);
2554 __m256i bgr2 = __lasx_xvld(ptr, 64);
2555
2556 __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128
2557 __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12);
2558 __m256i s20r = __lasx_xvpermi_d(__lasx_xvpermi_q(bgr2, bgr0, 0x12), 0x1b);
2559 __m256i b0 = __lasx_xvilvl_d(s20r, s01);
2560 __m256i g0 = _v256_alignr_b(s12, s01, 8);
2561 __m256i r0 = __lasx_xvilvh_d(s12, s20r);
2562
2563 a = v_uint64x4(b0);
2564 b = v_uint64x4(g0);
2565 c = v_uint64x4(r0);
2566}
2567
2568inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d)
2569{
2570 __m256i t0 = __lasx_xvld(ptr, 0);
2571 __m256i t1 = __lasx_xvld(ptr, 32);
2572 __m256i t2 = __lasx_xvld(ptr, 64);
2573 __m256i t3 = __lasx_xvld(ptr, 96);
2574
2575 const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
2576 __m256i ac_lo = __lasx_xvpickev_b(t1, t0);
2577 __m256i bd_lo = __lasx_xvpickod_b(t1, t0);
2578 __m256i ac_hi = __lasx_xvpickev_b(t3, t2);
2579 __m256i bd_hi = __lasx_xvpickod_b(t3, t2);
2580
2581 __m256i a_pre = __lasx_xvpickev_b(ac_hi, ac_lo);
2582 __m256i c_pre = __lasx_xvpickod_b(ac_hi, ac_lo);
2583 __m256i b_pre = __lasx_xvpickev_b(bd_hi, bd_lo);
2584 __m256i d_pre = __lasx_xvpickod_b(bd_hi, bd_lo);
2585
2586 a.val = __lasx_xvperm_w(a_pre, sh);
2587 b.val = __lasx_xvperm_w(b_pre, sh);
2588 c.val = __lasx_xvperm_w(c_pre, sh);
2589 d.val = __lasx_xvperm_w(d_pre, sh);
2590}
2591
2592inline void v_load_deinterleave(const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d)
2593{
2594 __m256i t0 = __lasx_xvld(ptr, 0);
2595 __m256i t1 = __lasx_xvld(ptr, 32);
2596 __m256i t2 = __lasx_xvld(ptr, 64);
2597 __m256i t3 = __lasx_xvld(ptr, 96);
2598
2599 const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
2600 __m256i ac_lo = __lasx_xvpickev_h(t1, t0);
2601 __m256i bd_lo = __lasx_xvpickod_h(t1, t0);
2602 __m256i ac_hi = __lasx_xvpickev_h(t3, t2);
2603 __m256i bd_hi = __lasx_xvpickod_h(t3, t2);
2604
2605 __m256i a_pre = __lasx_xvpickev_h(ac_hi, ac_lo);
2606 __m256i c_pre = __lasx_xvpickod_h(ac_hi, ac_lo);
2607 __m256i b_pre = __lasx_xvpickev_h(bd_hi, bd_lo);
2608 __m256i d_pre = __lasx_xvpickod_h(bd_hi, bd_lo);
2609
2610 a.val = __lasx_xvperm_w(a_pre, sh);
2611 b.val = __lasx_xvperm_w(b_pre, sh);
2612 c.val = __lasx_xvperm_w(c_pre, sh);
2613 d.val = __lasx_xvperm_w(d_pre, sh);
2614}
2615
2616inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
2617{
2618 __m256i p0 = __lasx_xvld(ptr, 0);
2619 __m256i p1 = __lasx_xvld(ptr, 32);
2620 __m256i p2 = __lasx_xvld(ptr, 64);
2621 __m256i p3 = __lasx_xvld(ptr, 96);
2622
2623 __m256i p01l = __lasx_xvilvl_w(p1, p0);
2624 __m256i p01h = __lasx_xvilvh_w(p1, p0);
2625 __m256i p23l = __lasx_xvilvl_w(p3, p2);
2626 __m256i p23h = __lasx_xvilvh_w(p3, p2);
2627
2628 __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
2629 __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
2630 __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
2631 __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);
2632
2633 __m256i b0 = __lasx_xvilvl_w(plh, pll);
2634 __m256i g0 = __lasx_xvilvh_w(plh, pll);
2635 __m256i r0 = __lasx_xvilvl_w(phh, phl);
2636 __m256i a0 = __lasx_xvilvh_w(phh, phl);
2637
2638 a = v_uint32x8(b0);
2639 b = v_uint32x8(g0);
2640 c = v_uint32x8(r0);
2641 d = v_uint32x8(a0);
2642}
2643
2644inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
2645{
2646 __m256i bgra0 = __lasx_xvld(ptr, 0);
2647 __m256i bgra1 = __lasx_xvld(ptr, 32);
2648 __m256i bgra2 = __lasx_xvld(ptr, 64);
2649 __m256i bgra3 = __lasx_xvld(ptr, 96);
2650
2651 __m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02);
2652 __m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13);
2653 __m256i l13 = __lasx_xvpermi_q(bgra1, bgra3, 0x02);
2654 __m256i h13 = __lasx_xvpermi_q(bgra1, bgra3, 0x13);
2655
2656 __m256i b0 = __lasx_xvilvl_d(l13, l02);
2657 __m256i g0 = __lasx_xvilvh_d(l13, l02);
2658 __m256i r0 = __lasx_xvilvl_d(h13, h02);
2659 __m256i a0 = __lasx_xvilvh_d(h13, h02);
2660
2661 a = v_uint64x4(b0);
2662 b = v_uint64x4(g0);
2663 c = v_uint64x4(r0);
2664 d = v_uint64x4(a0);
2665}
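// Usage sketch (illustrative): v_load_deinterleave splits interleaved channel data into
// separate vectors, e.g. 96 packed BGR bytes (32 triplets) into one register per channel:
//   v_uint8x32 bch, gch, rch;
//   v_load_deinterleave(bgr_ptr, bch, gch, rch);   // bgr_ptr: const uchar* to the packed data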
2666
2668
2669inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
2670                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2671{
2672 __m256i xy_l = __lasx_xvilvl_b(y.val, x.val);
2673 __m256i xy_h = __lasx_xvilvh_b(y.val, x.val);
2674
2675 __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2676 __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2677
2678 __lasx_xvst(xy0, (__m256i*)ptr, 0);
2679 __lasx_xvst(xy1, (__m256i*)ptr, 32*1);
2680}
2681
2682inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
2683                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2684{
2685 __m256i xy_l = __lasx_xvilvl_h(y.val, x.val);
2686 __m256i xy_h = __lasx_xvilvh_h(y.val, x.val);
2687
2688 __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2689 __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2690
2691 __lasx_xvst(xy0, (__m256i*)ptr, 0);
2692 __lasx_xvst(xy1, (__m256i*)ptr, 16*2);
2693}
2694
2695inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
2696                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2697{
2698 __m256i xy_l = __lasx_xvilvl_w(y.val, x.val);
2699 __m256i xy_h = __lasx_xvilvh_w(y.val, x.val);
2700
2701 __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2702 __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2703
2704 __lasx_xvst(xy0, (__m256i*)ptr, 0);
2705 __lasx_xvst(xy1, (__m256i*)ptr, 8*4);
2706}
2707
2708inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
2709                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2710{
2711 __m256i xy_l = __lasx_xvilvl_d(y.val, x.val);
2712 __m256i xy_h = __lasx_xvilvh_d(y.val, x.val);
2713
2714 __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2715 __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2716
2717 __lasx_xvst(xy0, (__m256i*)ptr, 0);
2718 __lasx_xvst(xy1, (__m256i*)ptr, 4*8);
2719}
2720
2721inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
2722                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2723{
2724 const __m256i sh_b = _v256_setr_b(
2725 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
2726 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2727 const __m256i sh_g = _v256_setr_b(
2728 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
2729 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2730 const __m256i sh_r = _v256_setr_b(
2731 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
2732 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2733
2734 __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b);
2735 __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
2736 __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);
2737
2738 const __m256i m0 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2739 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2740 const __m256i m1 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2741 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2742
2743 __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1);
2744 __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
2745 __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);
2746
2747 __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16);
2748 __m256i bgr1 = __lasx_xvpermi_q(p0, p2, 0 + 3*16);
2749 __m256i bgr2 = __lasx_xvpermi_q(p2, p1, 1 + 3*16);
2750
2751 __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2752 __lasx_xvst(bgr1, (__m256i*)ptr, 32);
2753 __lasx_xvst(bgr2, (__m256i*)ptr, 64);
2754}
2755
2756inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
2757                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2758{
2759 const __m256i sh_b = _v256_setr_b(
2760 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2761 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2762 const __m256i sh_g = _v256_setr_b(
2763 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2764 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2765 const __m256i sh_r = _v256_setr_b(
2766 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2767 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2768
2769 __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b);
2770 __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
2771 __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);
2772
2773 const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2774 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2775 const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2776 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2777
2778 __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1);
2779 __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
2780 __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);
2781
2782 __m256i bgr0 = __lasx_xvpermi_q(p2, p0, 0 + 2*16);
2783 __m256i bgr2 = __lasx_xvpermi_q(p2, p0, 1 + 3*16);
2784
2785 __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2786 __lasx_xvst(p1, (__m256i*)ptr, 16*2);
2787 __lasx_xvst(bgr2, (__m256i*)ptr, 32*2);
2788}
2789
2790inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
2791                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2792{
2793 __m256i b0 = __lasx_xvshuf4i_w(a.val, 0x6c);
2794 __m256i g0 = __lasx_xvshuf4i_w(b.val, 0xb1);
2795 __m256i r0 = __lasx_xvshuf4i_w(c.val, 0xc6);
2796
2797 __m256i bitmask_1 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0);
2798 __m256i bitmask_2 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);
2799
2800 __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, bitmask_1), r0, bitmask_2);
2801 __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, bitmask_1), b0, bitmask_2);
2802 __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, bitmask_1), g0, bitmask_2);
2803
2804 __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16);
2805 __m256i bgr2 = __lasx_xvpermi_q(p1, p0, 1 + 3*16);
2806
2807 __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2808 __lasx_xvst(p2, (__m256i*)ptr, 8*4);
2809 __lasx_xvst(bgr2, (__m256i*)ptr, 16*4);
2810}
2811
2812inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2813                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2814{
2815 __m256i s01 = __lasx_xvilvl_d(b.val, a.val);
2816 __m256i s12 = __lasx_xvilvh_d(c.val, b.val);
2817 __m256i s20 = __lasx_xvpermi_w(a.val, c.val, 0xe4);
2818
2819 __m256i bgr0 = __lasx_xvpermi_q(s20, s01, 0 + 2*16);
2820 __m256i bgr1 = __lasx_xvpermi_q(s01, s12, 0x30);
2821 __m256i bgr2 = __lasx_xvpermi_q(s12, s20, 1 + 3*16);
2822
2823 __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2824 __lasx_xvst(bgr1, (__m256i*)ptr, 4*8);
2825 __lasx_xvst(bgr2, (__m256i*)ptr, 8*8);
2826}
2827
2828inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
2829 const v_uint8x32& c, const v_uint8x32& d,
2830                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2831{
2832 __m256i bg0 = __lasx_xvilvl_b(b.val, a.val);
2833 __m256i bg1 = __lasx_xvilvh_b(b.val, a.val);
2834 __m256i ra0 = __lasx_xvilvl_b(d.val, c.val);
2835 __m256i ra1 = __lasx_xvilvh_b(d.val, c.val);
2836
2837 __m256i bgra0_ = __lasx_xvilvl_h(ra0, bg0);
2838 __m256i bgra1_ = __lasx_xvilvh_h(ra0, bg0);
2839 __m256i bgra2_ = __lasx_xvilvl_h(ra1, bg1);
2840 __m256i bgra3_ = __lasx_xvilvh_h(ra1, bg1);
2841
2842 __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
2843 __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
2844 __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
2845 __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);
2846
2847 __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2848 __lasx_xvst(bgra1, (__m256i*)ptr, 32);
2849 __lasx_xvst(bgra2, (__m256i*)ptr, 64);
2850 __lasx_xvst(bgra3, (__m256i*)ptr, 96);
2851}
2852
2853inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
2854 const v_uint16x16& c, const v_uint16x16& d,
2855                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2856{
2857 __m256i bg0 = __lasx_xvilvl_h(b.val, a.val);
2858 __m256i bg1 = __lasx_xvilvh_h(b.val, a.val);
2859 __m256i ra0 = __lasx_xvilvl_h(d.val, c.val);
2860 __m256i ra1 = __lasx_xvilvh_h(d.val, c.val);
2861
2862 __m256i bgra0_ = __lasx_xvilvl_w(ra0, bg0);
2863 __m256i bgra1_ = __lasx_xvilvh_w(ra0, bg0);
2864 __m256i bgra2_ = __lasx_xvilvl_w(ra1, bg1);
2865 __m256i bgra3_ = __lasx_xvilvh_w(ra1, bg1);
2866
2867 __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
2868 __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
2869 __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
2870 __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);
2871
2872 __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2873 __lasx_xvst(bgra1, (__m256i*)ptr, 16*2);
2874 __lasx_xvst(bgra2, (__m256i*)ptr, 32*2);
2875 __lasx_xvst(bgra3, (__m256i*)ptr, 48*2);
2876}
2877
2878inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
2879 const v_uint32x8& c, const v_uint32x8& d,
2880 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2881{
2882 __m256i bg0 = __lasx_xvilvl_w(b.val, a.val);
2883 __m256i bg1 = __lasx_xvilvh_w(b.val, a.val);
2884 __m256i ra0 = __lasx_xvilvl_w(d.val, c.val);
2885 __m256i ra1 = __lasx_xvilvh_w(d.val, c.val);
2886
2887 __m256i bgra0_ = __lasx_xvilvl_d(ra0, bg0);
2888 __m256i bgra1_ = __lasx_xvilvh_d(ra0, bg0);
2889 __m256i bgra2_ = __lasx_xvilvl_d(ra1, bg1);
2890 __m256i bgra3_ = __lasx_xvilvh_d(ra1, bg1);
2891
2892 __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
2893 __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
2894 __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
2895 __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);
2896
2897 __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2898 __lasx_xvst(bgra1, (__m256i*)ptr, 8*4);
2899 __lasx_xvst(bgra2, (__m256i*)ptr, 16*4);
2900 __lasx_xvst(bgra3, (__m256i*)ptr, 24*4);
2901}
2902
2903inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
2904 const v_uint64x4& c, const v_uint64x4& d,
2905 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2906{
2907 __m256i bg0 = __lasx_xvilvl_d(b.val, a.val);
2908 __m256i bg1 = __lasx_xvilvh_d(b.val, a.val);
2909 __m256i ra0 = __lasx_xvilvl_d(d.val, c.val);
2910 __m256i ra1 = __lasx_xvilvh_d(d.val, c.val);
2911
2912 __m256i bgra0 = __lasx_xvpermi_q(ra0, bg0, 0 + 2*16);
2913 __m256i bgra1 = __lasx_xvpermi_q(ra1, bg1, 0 + 2*16);
2914 __m256i bgra2 = __lasx_xvpermi_q(ra0, bg0, 1 + 3*16);
2915 __m256i bgra3 = __lasx_xvpermi_q(ra1, bg1, 1 + 3*16);
2916
2917 __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2918 __lasx_xvst(bgra1, (__m256i*)(ptr), 4*8);
2919 __lasx_xvst(bgra2, (__m256i*)(ptr), 8*8);
2920 __lasx_xvst(bgra3, (__m256i*)(ptr), 12*8);
2921}
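// Editorial sketch, not part of the original header: a minimal use of the
// 4-channel 64-bit store above. The helper name example_store_quad64 is
// hypothetical and dst must have room for 16 uint64 values.
static inline void example_store_quad64(uint64* dst, const v_uint64x4& b, const v_uint64x4& g,
                                        const v_uint64x4& r, const v_uint64x4& a)
{
    v_store_interleave(dst, b, g, r, a);  // layout: b0 g0 r0 a0 b1 g1 r1 a1 ...
}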
2922
2923
2924#define OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2925inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2926{ \
2927 _Tpvec1 a1, b1; \
2928 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2929 a0 = v_reinterpret_as_##suffix0(a1); \
2930 b0 = v_reinterpret_as_##suffix0(b1); \
2931} \
2932inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2933{ \
2934 _Tpvec1 a1, b1, c1; \
2935 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2936 a0 = v_reinterpret_as_##suffix0(a1); \
2937 b0 = v_reinterpret_as_##suffix0(b1); \
2938 c0 = v_reinterpret_as_##suffix0(c1); \
2939} \
2940inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2941{ \
2942 _Tpvec1 a1, b1, c1, d1; \
2943 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2944 a0 = v_reinterpret_as_##suffix0(a1); \
2945 b0 = v_reinterpret_as_##suffix0(b1); \
2946 c0 = v_reinterpret_as_##suffix0(c1); \
2947 d0 = v_reinterpret_as_##suffix0(d1); \
2948} \
2949inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2950 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2951{ \
2952 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2953 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2954 v_store_interleave((_Tp1*)ptr, a1, b1/*, mode*/); \
2955} \
2956inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
2957 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2958{ \
2959 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2960 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2961 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2962 v_store_interleave((_Tp1*)ptr, a1, b1, c1/*, mode*/); \
2963} \
2964inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2965 const _Tpvec0& c0, const _Tpvec0& d0, \
2966 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2967{ \
2968 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2969 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2970 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2971 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2972 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1/*, mode*/); \
2973}
2974
2975OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
2976OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
2977OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
2978OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
2979OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
2980OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
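// Editorial sketch, not part of the original header: the macro above routes the
// signed, float and double element types through the unsigned implementations by
// bit-pattern reinterpretation, so e.g. a packed float image row can be split
// into planes directly. The helper name example_split_bgr32f is hypothetical.
static inline void example_split_bgr32f(const float* packed, v_float32x8& b,
                                        v_float32x8& g, v_float32x8& r)
{
    // Reads 24 floats; internally reinterprets to v_uint32x8, runs the unsigned
    // deinterleave path, and reinterprets the results back to v_float32x8.
    v_load_deinterleave(packed, b, g, r);
}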
2981
2982//
2983// FP16
2984//
2985
2986inline v_float32x8 v256_load_expand(const hfloat* ptr)
2987{
2988#if CV_FP16
2989 // 1: 128-bit load, 2: xvpermi spreads the halves across the lanes, 3: convert to float32
2990 return v_float32x8(__lasx_xvfcvtl_s_h(__lasx_xvpermi_d(__lsx_vld((const __m128i*)ptr, 0), 0x10)));
2991#else
2992 float CV_DECL_ALIGNED(32) buf[8];
2993 for (int i = 0; i < 8; i++)
2994 buf[i] = (float)ptr[i];
2995 return v256_load_aligned(buf);
2996#endif
2997}
2998
2999inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
3000{
3001#if CV_FP16
3002 __m256i ah = __lasx_xvfcvt_h_s(a.val, a.val);
3003 __lsx_vst((__m128i)ah, ptr, 0);
3004#else
3005 float CV_DECL_ALIGNED(32) buf[8];
3006 v_store_aligned(buf, a);
3007 for (int i = 0; i < 8; i++)
3008 ptr[i] = hfloat(buf[i]);
3009#endif
3010}
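// Editorial sketch, not part of the original header: round-tripping eight
// half-precision values through the float32 register type using only the two
// helpers above. The helper name example_fp16_roundtrip is hypothetical.
static inline void example_fp16_roundtrip(const hfloat* src, hfloat* dst)
{
    v_float32x8 v = v256_load_expand(src);  // widen hfloat -> float32x8
    v_pack_store(dst, v);                   // narrow float32x8 -> hfloat
}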
3011
3012//
3013// end of FP16
3014//
3015
3016inline void v256_cleanup() {}
3017
3018CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3019
3021
3022} // cv::
3023
3024#endif // OPENCV_HAL_INTRIN_LASX_HPP