EstervQrCode 1.1.1
Library for QR code manipulation
intrin_lasx.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4 
5 #ifndef OPENCV_HAL_INTRIN_LASX_HPP
6 #define OPENCV_HAL_INTRIN_LASX_HPP
7 
8 #include <lsxintrin.h>
9 #include <lasxintrin.h>
10 
11 #define CV_SIMD256 1
12 #define CV_SIMD256_64F 1
13 #define CV_SIMD256_FP16 0
14 
15 namespace cv
16 {
17 
19 
20 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
21 
23 
24 inline __m256i _v256_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9,
25  char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19,
26  char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29,
27  char v30, char v31)
28 {
29  return (__m256i)v32i8{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
30  v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
31  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
32  v30, v31 };
33 }
34 
35 inline __m256i _v256_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9,
36  char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19,
37  char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29,
38  char v30, char v31)
39 {
40  return (__m256i)v32i8{ v31, v30,
41  v29, v28, v27, v26, v25, v24, v23, v22, v21, v20,
42  v19, v18, v17, v16, v15, v14, v13, v12, v11, v10,
43  v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 };
44 }
45 
46 inline __m256i _v256_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
47  short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15)
48 {
49  return (__m256i)v16i16{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 };
50 }
51 
52 inline __m256i _v256_setr_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
53 {
54  return (__m256i)v8i32{ v0, v1, v2, v3, v4, v5, v6, v7 };
55 }
56 
57 inline __m256i _v256_set_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
58 {
59  return (__m256i)v8i32{ v7, v6, v5, v4, v3, v2, v1, v0 };
60 }
61 
62 inline __m256i _v256_setall_w(int v0)
63 {
64  return (__m256i)v8i32{ v0, v0, v0, v0, v0, v0, v0, v0 };
65 }
66 
67 inline __m256i _v256_setr_d(int64 v0, int64 v1, int64 v2, int64 v3)
68 {
69  return (__m256i)v4i64{ v0, v1, v2, v3 };
70 }
71 
72 inline __m256i _v256_set_d(int64 v0, int64 v1, int64 v2, int64 v3)
73 {
74  return (__m256i)v4i64{ v3, v2, v1, v0 };
75 }
76 
77 inline __m256 _v256_setr_ps(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7)
78 {
79  return (__m256)v8f32{ v0, v1, v2, v3, v4, v5, v6, v7 };
80 }
81 
82 inline __m256 _v256_setall_ps(float f32)
83 {
84  return (__m256)v8f32{ f32, f32, f32, f32, f32, f32, f32, f32 };
85 }
86 
87 inline __m256d _v256_setr_pd(double v0, double v1, double v2, double v3)
88 {
89  return (__m256d)v4f64{ v0, v1, v2, v3 };
90 }
91 
92 inline __m256d _v256_setall_pd(double f64)
93 {
94  return (__m256d)v4f64{ f64, f64, f64, f64 };
95 }
96 
97 inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b)
98 {
99  return __lasx_xvssrarni_bu_h(b, a, 0);
100 }
101 
102 inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b)
103 {
104  return __lasx_xvssrarni_b_h(b, a, 0);
105 }
106 
107 inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b)
108 {
109  return __lasx_xvssrarni_hu_w(b, a, 0);
110 }
111 
112 inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b)
113 {
114  return __lasx_xvssrarni_h_w(b, a, 0);
115 }
116 
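// A minimal usage sketch for the pack helpers above (illustrative constants;
// per LASX semantics xvssrarni narrows within each 128-bit half, which is
// irrelevant here because every lane holds the same value):
//
//     __m256i a = __lasx_xvreplgr2vr_h(300);   // sixteen int16 lanes = 300
//     __m256i b = __lasx_xvreplgr2vr_h(-5);    // sixteen int16 lanes = -5
//     __m256i u = _lasx_packus_h(a, b);        // unsigned 8-bit: 255 and 0
//     __m256i s = _lasx_packs_h(a, b);         // signed   8-bit: 127 and -5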
117 inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
118 { return __lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02); }
119 
120 inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
121 { return __m256(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }
122 
123 inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
124 { return __m256d(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); }
125 
126 inline __m256i _v256_shuffle_odd_64(const __m256i& v)
127 { return __lasx_xvpermi_d(v, 0xd8); }
128 
129 inline __m256d _v256_shuffle_odd_64(const __m256d& v)
130 { return __m256d(__lasx_xvpermi_d(*((__m256i*)&v), 0xd8)); }
131 
132 // LASX: only use for permute WITHOUT zero clearing
133 template<int imm>
134 inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
135 { return __lasx_xvpermi_q(a, b, imm); }
136 
137 template<int imm>
138 inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
139 { return __m256(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }
140 
141 template<int imm>
142 inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
143 { return __m256d(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); }
144 
145 template<int imm, typename _Tpvec>
146 inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
147 { return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
148 
149 template<int imm>
150 inline __m256i _v256_permute4x64(const __m256i& a)
151 { return __lasx_xvpermi_d(a, imm); }
152 
153 template<int imm>
154 inline __m256d _v256_permute4x64(const __m256d& a)
155 { return __m256d(__lasx_xvpermi_d(*((__m256i*)&a), imm)); }
156 
157 template<int imm, typename _Tpvec>
158 inline _Tpvec v256_permute4x64(const _Tpvec& a)
159 { return _Tpvec(_v256_permute4x64<imm>(a.val)); }
160 
161 inline __m128i _v256_extract_high(const __m256i& v)
162 { __m256i temp256i = __lasx_xvpermi_d(v, 0x4E);
163  return *((__m128i*)&temp256i); }
164 
165 inline __m128 _v256_extract_high(const __m256& v)
166 { return __m128(_v256_extract_high(*((__m256i*)&v))); }
167 
168 inline __m128d _v256_extract_high(const __m256d& v)
169 { return __m128d(_v256_extract_high(*((__m256i*)&v))); }
170 
171 inline __m128i _v256_extract_low(const __m256i& v)
172 { return *((__m128i*)&v); }
173 
174 inline __m128 _v256_extract_low(const __m256& v)
175 { return __m128(_v256_extract_low(*((__m256i*)&v))); }
176 
177 inline __m128d _v256_extract_low(const __m256d& v)
178 { return __m128d(_v256_extract_low(*((__m256i*)&v))); }
179 
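// Usage sketch for the half-register helpers above (illustrative values;
// assumes the little-endian lane layout the rest of this header relies on):
//
//     __m256i v  = _v256_setr_w(0, 1, 2, 3, 4, 5, 6, 7);
//     __m128i lo = _v256_extract_low(v);   // lanes 0..3
//     __m128i hi = _v256_extract_high(v);  // lanes 4..7 (halves swapped first)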
180 inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
181 {
182  return __lasx_xvssrlrni_hu_w(b, a, 0);
183 }
184 
185 template<int i>
186 inline int _v256_extract_b(const __m256i& a)
187 {
188  int des[1] = {0};
189  __lasx_xvstelm_b(a, des, 0, i);
190  return des[0];
191 }
192 
193 template<int i>
194 inline int _v256_extract_h(const __m256i& a)
195 {
196  int des[1] = {0};
197  __lasx_xvstelm_h(a, des, 0, i);
198  return des[0];
199 }
200 
201 template<int i>
202 inline int _v256_extract_w(const __m256i& a)
203 {
204  return __lasx_xvpickve2gr_w(a, i);
205 }
206 
207 template<int i>
208 inline int64 _v256_extract_d(const __m256i& a)
209 {
210  return __lasx_xvpickve2gr_d(a, i);
211 }
212 
214 
215 struct v_uint8x32
216 {
217  typedef uchar lane_type;
218  enum { nlanes = 32 };
219  __m256i val;
220 
221  explicit v_uint8x32(__m256i v) : val(v) {}
222  v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3,
223  uchar v4, uchar v5, uchar v6, uchar v7,
224  uchar v8, uchar v9, uchar v10, uchar v11,
225  uchar v12, uchar v13, uchar v14, uchar v15,
226  uchar v16, uchar v17, uchar v18, uchar v19,
227  uchar v20, uchar v21, uchar v22, uchar v23,
228  uchar v24, uchar v25, uchar v26, uchar v27,
229  uchar v28, uchar v29, uchar v30, uchar v31)
230  {
231  val = _v256_setr_b((char)v0, (char)v1, (char)v2, (char)v3,
232  (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9,
233  (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
234  (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
235  (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
236  (char)v28, (char)v29, (char)v30, (char)v31);
237  }
238  /* coverity[uninit_ctor]: suppress warning */
239  v_uint8x32() {}
240 
241  uchar get0() const {
242  uchar des[1] = {0};
243  __lasx_xvstelm_b(val, des, 0, 0);
244  return des[0];
245  }
246 };
247 
248 struct v_int8x32
249 {
250  typedef schar lane_type;
251  enum { nlanes = 32 };
252  __m256i val;
253 
254  explicit v_int8x32(__m256i v) : val(v) {}
255  v_int8x32(schar v0, schar v1, schar v2, schar v3,
256  schar v4, schar v5, schar v6, schar v7,
257  schar v8, schar v9, schar v10, schar v11,
258  schar v12, schar v13, schar v14, schar v15,
259  schar v16, schar v17, schar v18, schar v19,
260  schar v20, schar v21, schar v22, schar v23,
261  schar v24, schar v25, schar v26, schar v27,
262  schar v28, schar v29, schar v30, schar v31)
263  {
264  val = _v256_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
265  v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
266  v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
267  }
268  /* coverity[uninit_ctor]: suppress warning */
269  v_int8x32() {}
270 
271  schar get0() const {
272  schar des[1] = {0};
273  __lasx_xvstelm_b(val, des, 0, 0);
274  return des[0];
275  }
276 };
277 
278 struct v_uint16x16
279 {
280  typedef ushort lane_type;
281  enum { nlanes = 16 };
282  __m256i val;
283 
284  explicit v_uint16x16(__m256i v) : val(v) {}
285  v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3,
286  ushort v4, ushort v5, ushort v6, ushort v7,
287  ushort v8, ushort v9, ushort v10, ushort v11,
288  ushort v12, ushort v13, ushort v14, ushort v15)
289  {
290  val = _v256_setr_h((short)v0, (short)v1, (short)v2, (short)v3,
291  (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
292  (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
293  }
294  /* coverity[uninit_ctor]: suppress warning */
295  v_uint16x16() {}
296 
297  ushort get0() const {
298  ushort des[1] = {0};
299  __lasx_xvstelm_h(val, des, 0, 0);
300  return des[0];
301  }
302 };
303 
304 struct v_int16x16
305 {
306  typedef short lane_type;
307  enum { nlanes = 16 };
308  __m256i val;
309 
310  explicit v_int16x16(__m256i v) : val(v) {}
311  v_int16x16(short v0, short v1, short v2, short v3,
312  short v4, short v5, short v6, short v7,
313  short v8, short v9, short v10, short v11,
314  short v12, short v13, short v14, short v15)
315  {
316  val = _v256_setr_h(v0, v1, v2, v3, v4, v5, v6, v7,
317  v8, v9, v10, v11, v12, v13, v14, v15);
318  }
319  /* coverity[uninit_ctor]: suppress warning */
320  v_int16x16() {}
321 
322  short get0() const {
323  short des[1] = {0};
324  __lasx_xvstelm_h(val, des, 0, 0);
325  return des[0];
326  }
327 };
328 
329 struct v_uint32x8
330 {
331  typedef unsigned lane_type;
332  enum { nlanes = 8 };
333  __m256i val;
334 
335  explicit v_uint32x8(__m256i v) : val(v) {}
336  v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
337  unsigned v4, unsigned v5, unsigned v6, unsigned v7)
338  {
339  val = _v256_setr_w((unsigned)v0, (unsigned)v1, (unsigned)v2,
340  (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
341  }
342  /* coverity[uninit_ctor]: suppress warning */
343  v_uint32x8() {}
344 
345  unsigned get0() const { return __lasx_xvpickve2gr_wu(val, 0); }
346 };
347 
348 struct v_int32x8
349 {
350  typedef int lane_type;
351  enum { nlanes = 8 };
352  __m256i val;
353 
354  explicit v_int32x8(__m256i v) : val(v) {}
355  v_int32x8(int v0, int v1, int v2, int v3,
356  int v4, int v5, int v6, int v7)
357  {
358  val = _v256_setr_w(v0, v1, v2, v3, v4, v5, v6, v7);
359  }
360  /* coverity[uninit_ctor]: suppress warning */
361  v_int32x8() {}
362 
363  int get0() const { return __lasx_xvpickve2gr_w(val, 0); }
364 };
365 
366 struct v_float32x8
367 {
368  typedef float lane_type;
369  enum { nlanes = 8 };
370  __m256 val;
371 
372  explicit v_float32x8(__m256 v) : val(v) {}
373  explicit v_float32x8(__m256i v) { val = *((__m256*)&v); }
374  v_float32x8(float v0, float v1, float v2, float v3,
375  float v4, float v5, float v6, float v7)
376  {
377  val = _v256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
378  }
379  /* coverity[uninit_ctor]: suppress warning */
380  v_float32x8() {}
381 
382  float get0() const {
383  float des[1] = {0};
384  __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0);
385  return des[0];
386  }
387 
388  int get0toint() const {
389  int des[1] = {0};
390  __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0);
391  return des[0];
392  }
393 };
394 
395 struct v_uint64x4
396 {
397  typedef uint64 lane_type;
398  enum { nlanes = 4 };
399  __m256i val;
400 
401  explicit v_uint64x4(__m256i v) : val(v) {}
402  v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
403  { val = _v256_setr_d((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
404  /* coverity[uninit_ctor]: suppress warning */
405  v_uint64x4() {}
406 
407  uint64 get0() const
408  {
409  return __lasx_xvpickve2gr_du(val, 0);
410  }
411 };
412 
413 struct v_int64x4
414 {
415  typedef int64 lane_type;
416  enum { nlanes = 4 };
417  __m256i val;
418 
419  explicit v_int64x4(__m256i v) : val(v) {}
420  v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
421  { val = _v256_setr_d(v0, v1, v2, v3); }
422  /* coverity[uninit_ctor]: suppress warning */
423  v_int64x4() {}
424 
425  int64 get0() const
426  {
427  return __lasx_xvpickve2gr_d(val, 0);
428  }
429 };
430 
431 struct v_float64x4
432 {
433  typedef double lane_type;
434  enum { nlanes = 4 };
435  __m256d val;
436 
437  explicit v_float64x4(__m256d v) : val(v) {}
438  explicit v_float64x4(__m256i v) { val = *((__m256d*)&v); }
439  v_float64x4(double v0, double v1, double v2, double v3)
440  { val = _v256_setr_pd(v0, v1, v2, v3); }
441  /* coverity[uninit_ctor]: suppress warning */
442  v_float64x4() {}
443 
444  double get0() const {
445  double des[1] = {0};
446  __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0);
447  return des[0];
448  }
449 
450  int64 get0toint64() const {
451  int64 des[1] = {0};
452  __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0);
453  return des[0];
454  }
455 };
456 
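// Each v_* struct above wraps a single 256-bit LASX register together with its
// lane type and lane count, so the element layout is encoded in the type name.
// Illustrative sketch:
//
//     v_int32x8   vi(1, 2, 3, 4, 5, 6, 7, 8);   // eight 32-bit lanes
//     v_float64x4 vd(0.5, 1.5, 2.5, 3.5);       // four 64-bit lanes
//     int    i0 = vi.get0();                    // 1  (lane 0 only)
//     double d0 = vd.get0();                    // 0.5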
458 
459 #define OPENCV_HAL_IMPL_LASX_LOADSTORE(_Tpvec, _Tp) \
460  inline _Tpvec v256_load(const _Tp* ptr) \
461  { return _Tpvec(__lasx_xvld(ptr, 0)); } \
462  inline _Tpvec v256_load_aligned(const _Tp* ptr) \
463  { return _Tpvec(__lasx_xvld(ptr, 0)); } \
464  inline _Tpvec v256_load_low(const _Tp* ptr) \
465  { \
466  __m128i v128 = __lsx_vld(ptr, 0); \
467  return _Tpvec(*((__m256i*)&v128)); \
468  } \
469  inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
470  { \
471  __m128i vlo = __lsx_vld(ptr0, 0); \
472  __m128i vhi = __lsx_vld(ptr1, 0); \
473  return _Tpvec(_v256_combine(vlo, vhi)); \
474  } \
475  inline void v_store(_Tp* ptr, const _Tpvec& a) \
476  { __lasx_xvst(a.val, ptr, 0); } \
477  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
478  { __lasx_xvst(a.val, ptr, 0); } \
479  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
480  { __lasx_xvst(a.val, ptr, 0); } \
481  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
482  { \
483  if( mode == hal::STORE_UNALIGNED ) \
484  __lasx_xvst(a.val, ptr, 0); \
485  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
486  __lasx_xvst(a.val, ptr, 0); \
487  else \
488  __lasx_xvst(a.val, ptr, 0); \
489  } \
490  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
491  { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \
492  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
493  { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }
494 
495 OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint8x32, uchar)
496 OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int8x32, schar)
497 OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint16x16, ushort)
498 OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int16x16, short)
499 OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint32x8, unsigned)
500 OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int32x8, int)
501 OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint64x4, uint64)
502 OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int64x4, int64)
503 
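// Usage sketch for the generated load/store interface (buffer is illustrative;
// note that every store branch above expands to the same xvst, since plain
// xvld/xvst are used for aligned and unaligned pointers alike):
//
//     ushort buf[16] = {0};
//     v_uint16x16 r = v256_load(buf);       // full 256-bit load
//     v_uint16x16 l = v256_load_low(buf);   // low 128 bits only
//     v_store(buf, r);                      // full store
//     v_store_high(buf + 8, r);             // upper half to the second 8 lanes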
504 
505 #define OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \
506  inline _Tpvec v256_load(const _Tp* ptr) \
507  { return _Tpvec(__lasx_xvld(ptr, 0)); } \
508  inline _Tpvec v256_load_aligned(const _Tp* ptr) \
509  { return _Tpvec(__lasx_xvld(ptr, 0)); } \
510  inline _Tpvec v256_load_low(const _Tp* ptr) \
511  { \
512  __m128i v128 = __lsx_vld(ptr, 0); \
513  return _Tpvec(*((__m256i*)&v128)); \
514  } \
515  inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
516  { \
517  halfreg vlo = __lsx_vld(ptr0, 0); \
518  halfreg vhi = __lsx_vld(ptr1, 0); \
519  return _Tpvec(_v256_combine(vlo, vhi)); \
520  } \
521  inline void v_store(_Tp* ptr, const _Tpvec& a) \
522  { __lasx_xvst(a.val, ptr, 0); } \
523  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
524  { __lasx_xvst(a.val, ptr, 0); } \
525  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
526  { __lasx_xvst(a.val, ptr, 0); } \
527  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
528  { \
529  if( mode == hal::STORE_UNALIGNED ) \
530  __lasx_xvst(a.val, ptr, 0); \
531  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
532  __lasx_xvst(a.val, ptr, 0); \
533  else \
534  __lasx_xvst(a.val, ptr, 0); \
535  } \
536  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
537  { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \
538  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
539  { __lsx_vst(_v256_extract_high(a.val), ptr, 0); }
540 
541 OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float32x8, float, __m128i)
542 OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float64x4, double, __m128i)
543 
544 
545 inline __m256i _lasx_256_castps_si256(const __m256& v)
546 { return __m256i(v); }
547 
548 inline __m256i _lasx_256_castpd_si256(const __m256d& v)
549 { return __m256i(v); }
550 
551 #define OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
552  inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
553  { return _Tpvec(cast(a.val)); }
554 
555 #define OPENCV_HAL_IMPL_LASX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
556  inline _Tpvec v256_setzero_##suffix() \
557  { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
558  inline _Tpvec v256_setall_##suffix(_Tp v) \
559  { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \
560  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
561  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
562  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
563  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \
564  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \
565  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \
566  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \
567  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \
568  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float32x8, suffix, _lasx_256_castps_si256) \
569  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float64x4, suffix, _lasx_256_castpd_si256)
570 
571 OPENCV_HAL_IMPL_LASX_INIT(v_uint8x32, uchar, u8, b, int)
572 OPENCV_HAL_IMPL_LASX_INIT(v_int8x32, schar, s8, b, int)
573 OPENCV_HAL_IMPL_LASX_INIT(v_uint16x16, ushort, u16, h, int)
574 OPENCV_HAL_IMPL_LASX_INIT(v_int16x16, short, s16, h, int)
575 OPENCV_HAL_IMPL_LASX_INIT(v_uint32x8, unsigned, u32, w, int)
576 OPENCV_HAL_IMPL_LASX_INIT(v_int32x8, int, s32, w, int)
577 OPENCV_HAL_IMPL_LASX_INIT(v_uint64x4, uint64, u64, d, long int)
578 OPENCV_HAL_IMPL_LASX_INIT(v_int64x4, int64, s64, d, long int)
579 
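// Usage sketch for the generated initializers and reinterpret casts
// (illustrative values):
//
//     v_uint8x32  z = v256_setzero_u8();
//     v_int16x16  s = v256_setall_s16(-7);       // every lane = -7
//     v_uint16x16 u = v_reinterpret_as_u16(s);   // same bits, unsigned view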
580 
581 inline __m256 _lasx_256_castsi256_ps(const __m256i &v)
582 { return __m256(v); }
583 
584 inline __m256d _lasx_256_castsi256_pd(const __m256i &v)
585 { return __m256d(v); }
586 
587 #define OPENCV_HAL_IMPL_LASX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
588  inline _Tpvec v256_setzero_##suffix() \
589  { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \
590  inline _Tpvec v256_setall_##suffix(_Tp v) \
591  { return _Tpvec(_v256_setall_##zsuffix(v)); } \
592  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
593  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, cast) \
594  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
595  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, cast) \
596  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, cast) \
597  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, cast) \
598  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, cast) \
599  OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, cast)
600 
601 OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float32x8, float, f32, ps, _lasx_256_castsi256_ps)
602 OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float64x4, double, f64, pd, _lasx_256_castsi256_pd)
603 
604 inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
605 { return a; }
606 inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
607 { return v_float32x8(_lasx_256_castps_si256(__m256(a.val))); }
608 
609 inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
610 { return a; }
611 inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
612 { return v_float64x4(_lasx_256_castpd_si256(__m256d(a.val))); }
613 
614 
616 
617 // unpacks
618 #define OPENCV_HAL_IMPL_LASX_UNPACK(_Tpvec, suffix) \
619  inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
620  { return _Tpvec(__lasx_xvilvl_##suffix(__m256i(b.val), __m256i(a.val))); } \
621  inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
622  { return _Tpvec(__lasx_xvilvh_##suffix(__m256i(b.val), __m256i(a.val))); }
623 
624 OPENCV_HAL_IMPL_LASX_UNPACK(v_uint8x32, b)
625 OPENCV_HAL_IMPL_LASX_UNPACK(v_int8x32, b)
626 OPENCV_HAL_IMPL_LASX_UNPACK(v_uint16x16, h)
627 OPENCV_HAL_IMPL_LASX_UNPACK(v_int16x16, h)
628 OPENCV_HAL_IMPL_LASX_UNPACK(v_uint32x8, w)
629 OPENCV_HAL_IMPL_LASX_UNPACK(v_int32x8, w)
630 OPENCV_HAL_IMPL_LASX_UNPACK(v_uint64x4, d)
631 OPENCV_HAL_IMPL_LASX_UNPACK(v_int64x4, d)
632 OPENCV_HAL_IMPL_LASX_UNPACK(v_float32x8, w)
633 OPENCV_HAL_IMPL_LASX_UNPACK(v_float64x4, d)
634 
635 
636 // shuffle
637 // todo: emulate 64bit
638 #define OPENCV_HAL_IMPL_LASX_SHUFFLE(_Tpvec, intrin) \
639  template<int m> \
640  inline _Tpvec v256_shuffle(const _Tpvec& a) \
641  { return _Tpvec(__lasx_xvshuf4i_##intrin(a.val, m)); }
642 
643 OPENCV_HAL_IMPL_LASX_SHUFFLE(v_uint32x8, w)
644 OPENCV_HAL_IMPL_LASX_SHUFFLE(v_int32x8, w)
645 
646 template<int m>
647 inline v_float32x8 v256_shuffle(const v_float32x8 &a)
648 { return v_float32x8(__lasx_xvshuf4i_w(*((__m256i*)&a.val), m)); }
649 
650 template<int m>
651 inline v_float64x4 v256_shuffle(const v_float64x4 &a)
652 {
653  int imm8 = m & 0b0001; //0 or 1
654  if (m & 0b0010) imm8 |= 0b0100;
655  //else imm8 |= 0b0000;
656  if (m & 0b0100) imm8 |= 0b110000; //2 or 3
657  else imm8 |= 0b100000;
658  if (m & 0b1000) imm8 |= 0b11000000;
659  else imm8 |= 0b10000000;
660 
661  return v_float64x4(__lasx_xvpermi_d(*((__m256i*)&a.val), imm8));
662 }
663 template<typename _Tpvec>
664 inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
665 {
666  ab0 = v256_unpacklo(a, b);
667  ab1 = v256_unpackhi(a, b);
668 }
669 
670 template<typename _Tpvec>
671 inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
672 { return _Tpvec(__lasx_xvpermi_q(a.val, b.val, 0x12)); }
673 
674 inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
675 { return v_float32x8(__lasx_xvpermi_q(a.val, b.val, 0x12)); }
676 
677 inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
678 { return v_float64x4(__lasx_xvpermi_q(a.val, b.val, 0x12)); }
679 
680 template<typename _Tpvec>
681 inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
682 { return v256_permute2x128<0x03>(a, b); }
683 
684 inline __m256i _v256_alignr_b(const __m256i &a, const __m256i &b, const int imm)
685 {
686  if (imm == 8) {
687  return __lasx_xvshuf4i_d(b, a, 0x9); // b.d1 a.d0 b.d3 a.d2
688  } else {
689  __m256i byteIndex = _v256_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
690  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
691  return __lasx_xvshuf_b(a, b, __lasx_xvadd_b(__lasx_xvreplgr2vr_b(imm), byteIndex));
692  }
693 }
694 
695 template<typename _Tpvec>
696 inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
697 { return _Tpvec(_v256_alignr_b(a.val, b.val, 8)); }
698 inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
699 { return v_float64x4(__lasx_xvshuf4i_d(b.val, a.val, 0x9)); } // b.d1 a.d0 b.d3 a.d2
700 // todo: emulate float32
701 
702 template<typename _Tpvec>
703 inline _Tpvec v256_swap_halves(const _Tpvec& a)
704 { return v256_permute2x128<1>(a, a); }
705 
706 template<typename _Tpvec>
707 inline _Tpvec v256_reverse_64(const _Tpvec& a)
708 { return v256_permute4x64<0x1b>(a); }
709 
710 
711 // ZIP
712 #define OPENCV_HAL_IMPL_LASX_ZIP(_Tpvec) \
713  inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
714  { return v256_permute2x128<0x02>(a, b); } \
715  inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
716  { return v256_permute2x128<0x13>(a, b); } \
717  inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
718  _Tpvec& c, _Tpvec& d) \
719  { \
720  _Tpvec a1b0 = v256_alignr_128(a, b); \
721  c = v256_combine_diagonal(a, a1b0); \
722  d = v256_combine_diagonal(a1b0, b); \
723  } \
724  inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
725  _Tpvec& ab0, _Tpvec& ab1) \
726  { \
727  _Tpvec ab0ab2, ab1ab3; \
728  v256_zip(a, b, ab0ab2, ab1ab3); \
729  v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
730  }
731 
732 OPENCV_HAL_IMPL_LASX_ZIP(v_uint8x32)
733 OPENCV_HAL_IMPL_LASX_ZIP(v_int8x32)
734 OPENCV_HAL_IMPL_LASX_ZIP(v_uint16x16)
735 OPENCV_HAL_IMPL_LASX_ZIP(v_int16x16)
736 OPENCV_HAL_IMPL_LASX_ZIP(v_uint32x8)
737 OPENCV_HAL_IMPL_LASX_ZIP(v_int32x8)
738 OPENCV_HAL_IMPL_LASX_ZIP(v_uint64x4)
739 OPENCV_HAL_IMPL_LASX_ZIP(v_int64x4)
740 OPENCV_HAL_IMPL_LASX_ZIP(v_float32x8)
741 OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4)
742 
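// Usage sketch (illustrative data): v_zip interleaves corresponding lanes of
// a and b, while v_combine_low/high splice the 128-bit halves of the inputs.
//
//     v_int32x8 a(0, 1, 2, 3, 4, 5, 6, 7);
//     v_int32x8 b(10, 11, 12, 13, 14, 15, 16, 17);
//     v_int32x8 ab0, ab1;
//     v_zip(a, b, ab0, ab1);       // ab0 = {0,10,1,11,2,12,3,13}
//                                  // ab1 = {4,14,5,15,6,16,7,17}
//     v_int32x8 lo = v_combine_low(a, b);   // {0,1,2,3,10,11,12,13}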
743 
746 #define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \
747  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
748  { return _Tpvec(intrin(a.val, b.val)); } \
749  inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
750  { a.val = intrin(a.val, b.val); return a; }
751 
752 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint8x32, __lasx_xvsadd_bu)
753 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint8x32, __lasx_xvssub_bu)
754 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int8x32, __lasx_xvsadd_b)
755 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int8x32, __lasx_xvssub_b)
756 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint16x16, __lasx_xvsadd_hu)
757 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint16x16, __lasx_xvssub_hu)
758 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int16x16, __lasx_xvsadd_h)
759 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int16x16, __lasx_xvssub_h)
760 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint32x8, __lasx_xvadd_w)
761 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint32x8, __lasx_xvsub_w)
762 OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_uint32x8, __lasx_xvmul_w)
763 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int32x8, __lasx_xvadd_w)
764 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int32x8, __lasx_xvsub_w)
765 OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_int32x8, __lasx_xvmul_w)
766 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_uint64x4, __lasx_xvadd_d)
767 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_uint64x4, __lasx_xvsub_d)
768 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_int64x4, __lasx_xvadd_d)
769 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_int64x4, __lasx_xvsub_d)
770 
771 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float32x8, __lasx_xvfadd_s)
772 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float32x8, __lasx_xvfsub_s)
773 OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float32x8, __lasx_xvfmul_s)
774 OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float32x8, __lasx_xvfdiv_s)
775 OPENCV_HAL_IMPL_LASX_BIN_OP(+, v_float64x4, __lasx_xvfadd_d)
776 OPENCV_HAL_IMPL_LASX_BIN_OP(-, v_float64x4, __lasx_xvfsub_d)
777 OPENCV_HAL_IMPL_LASX_BIN_OP(*, v_float64x4, __lasx_xvfmul_d)
778 OPENCV_HAL_IMPL_LASX_BIN_OP(/, v_float64x4, __lasx_xvfdiv_d)
779 
780 // saturating multiply 8-bit, 16-bit
781 inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
782 {
783  v_uint16x16 c, d;
784  v_mul_expand(a, b, c, d);
785  return v_pack(c, d);
786 }
787 inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
788 {
789  v_int16x16 c, d;
790  v_mul_expand(a, b, c, d);
791  return v_pack(c, d);
792 }
793 inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
794 {
795  __m256i pl = __lasx_xvmul_h(a.val, b.val);
796  __m256i ph = __lasx_xvmuh_hu(a.val, b.val);
797  __m256i p0 = __lasx_xvilvl_h(ph, pl);
798  __m256i p1 = __lasx_xvilvh_h(ph, pl);
799  return v_uint16x16(_v256_packs_epu32(p0, p1));
800 }
801 inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
802 {
803  __m256i pl = __lasx_xvmul_h(a.val, b.val);
804  __m256i ph = __lasx_xvmuh_h(a.val, b.val);
805  __m256i p0 = __lasx_xvilvl_h(ph, pl);
806  __m256i p1 = __lasx_xvilvh_h(ph, pl);
807  return v_int16x16(_lasx_packs_w(p0, p1));
808 }
809 inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
810 { a = a * b; return a; }
811 inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
812 { a = a * b; return a; }
813 inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
814 { a = a * b; return a; }
815 inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
816 { a = a * b; return a; }
817 
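// For 8- and 16-bit integer vectors the operators above saturate, while the
// v_*_wrap functions defined below use plain modular arithmetic; 32- and
// 64-bit operators always wrap. Illustrative sketch:
//
//     v_uint8x32 x = v256_setall_u8(200), y = v256_setall_u8(100);
//     v_uint8x32 sat  = x + y;              // every lane == 255 (saturated)
//     v_uint8x32 wrap = v_add_wrap(x, y);   // every lane == 44  (300 mod 256)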
820 #define OPENCV_HAL_IMPL_LASX_BIN_FUNC(func, _Tpvec, intrin) \
821  inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
822  { return _Tpvec(intrin(a.val, b.val)); }
823 
824 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint8x32, __lasx_xvadd_b)
825 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int8x32, __lasx_xvadd_b)
826 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint16x16, __lasx_xvadd_h)
827 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int16x16, __lasx_xvadd_h)
828 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint8x32, __lasx_xvsub_b)
829 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int8x32, __lasx_xvsub_b)
830 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint16x16, __lasx_xvsub_h)
831 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int16x16, __lasx_xvsub_h)
832 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_uint16x16, __lasx_xvmul_h)
833 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16, __lasx_xvmul_h)
834 
835 inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
836 {
837  __m256i p0 = __lasx_xvmulwev_h_bu(a.val, b.val);
838  __m256i p1 = __lasx_xvmulwod_h_bu(a.val, b.val);
839  return v_uint8x32(__lasx_xvpackev_b(p1, p0));
840 }
841 
842 inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
843 {
844  return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
845 }
846 
847 // Multiply and expand
848 inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
849  v_uint16x16& c, v_uint16x16& d)
850 {
851  v_uint16x16 a0, a1, b0, b1;
852  v_expand(a, a0, a1);
853  v_expand(b, b0, b1);
854  c = v_mul_wrap(a0, b0);
855  d = v_mul_wrap(a1, b1);
856 }
857 
858 inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
859  v_int16x16& c, v_int16x16& d)
860 {
861  v_int16x16 a0, a1, b0, b1;
862  v_expand(a, a0, a1);
863  v_expand(b, b0, b1);
864  c = v_mul_wrap(a0, b0);
865  d = v_mul_wrap(a1, b1);
866 }
867 
868 inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
869  v_int32x8& c, v_int32x8& d)
870 {
871  v_int16x16 vhi = v_int16x16(__lasx_xvmuh_h(a.val, b.val));
872 
873  v_int16x16 v0, v1;
874  v_zip(v_mul_wrap(a, b), vhi, v0, v1);
875 
876  c = v_reinterpret_as_s32(v0);
877  d = v_reinterpret_as_s32(v1);
878 }
879 
880 inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
881  v_uint32x8& c, v_uint32x8& d)
882 {
883  v_uint16x16 vhi = v_uint16x16(__lasx_xvmuh_hu(a.val, b.val));
884 
885  v_uint16x16 v0, v1;
886  v_zip(v_mul_wrap(a, b), vhi, v0, v1);
887 
888  c = v_reinterpret_as_u32(v0);
889  d = v_reinterpret_as_u32(v1);
890 }
891 
892 inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
893  v_uint64x4& c, v_uint64x4& d)
894 {
895  __m256i v0 = __lasx_xvmulwev_d_wu(a.val, b.val);
896  __m256i v1 = __lasx_xvmulwod_d_wu(a.val, b.val);
897  v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
898 }
899 
900 inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(__lasx_xvmuh_h(a.val, b.val)); }
901 inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); }
902 
904 #define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
905  inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
906  { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
907  inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
908  { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
909  inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
910  { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
911  inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
912  { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
913  template<int imm> \
914  inline _Tpuvec v_shl(const _Tpuvec& a) \
915  { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
916  template<int imm> \
917  inline _Tpsvec v_shl(const _Tpsvec& a) \
918  { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
919  template<int imm> \
920  inline _Tpuvec v_shr(const _Tpuvec& a) \
921  { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \
922  template<int imm> \
923  inline _Tpsvec v_shr(const _Tpsvec& a) \
924  { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); }
925 
926 OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h)
927 OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8, v_int32x8, w, __lasx_xvsra_w)
928 OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d)
929 
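// Usage sketch for the generated shifts (illustrative values): operator>> and
// v_shr are logical for unsigned lanes and arithmetic for signed lanes.
//
//     v_int32x8  v = v256_setall_s32(-16);
//     v_int32x8  a = v >> 2;                          // arithmetic: -4
//     v_uint32x8 u = v_reinterpret_as_u32(v) >> 2;    // logical: 0x3FFFFFFC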
930 
931 
932 #define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \
933  OPENCV_HAL_IMPL_LASX_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix) \
934  OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix) \
935  OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix) \
936  inline _Tpvec operator ~ (const _Tpvec& a) \
937  { return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
938 
939 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1))
940 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32, v, __lasx_xvreplgr2vr_w(-1))
941 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint16x16, v, __lasx_xvreplgr2vr_w(-1))
942 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int16x16, v, __lasx_xvreplgr2vr_w(-1))
943 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint32x8, v, __lasx_xvreplgr2vr_w(-1))
944 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int32x8, v, __lasx_xvreplgr2vr_w(-1))
945 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1))
946 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1))
947 
948 #define OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
949  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
950  { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } \
951  inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
952  { __m256i c = intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val))); a.val = cast(c); return a; }
953 
954 #define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \
955  OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(&, _Tpvec, __lasx_xvand_##suffix, cast) \
956  OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix, cast) \
957  OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix, cast) \
958  inline _Tpvec operator ~ (const _Tpvec& a) \
959  { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); }
960 
961 OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps)
962 OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float64x4, v, __lasx_xvreplgr2vr_d(-1), _lasx_256_castsi256_pd)
963 
964 
965 #define OPENCV_HAL_IMPL_LASX_SELECT(_Tpvec) \
966  inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
967  { return _Tpvec(__lasx_xvbitsel_v(b.val, a.val, mask.val)); }
968 
969 OPENCV_HAL_IMPL_LASX_SELECT(v_uint8x32)
970 OPENCV_HAL_IMPL_LASX_SELECT(v_int8x32)
971 OPENCV_HAL_IMPL_LASX_SELECT(v_uint16x16)
972 OPENCV_HAL_IMPL_LASX_SELECT(v_int16x16)
973 OPENCV_HAL_IMPL_LASX_SELECT(v_uint32x8)
974 OPENCV_HAL_IMPL_LASX_SELECT(v_int32x8)
975 
976 inline v_float32x8 v_select(const v_float32x8 &mask, const v_float32x8 &a, const v_float32x8 &b)
977 { return v_float32x8(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }
978 
979 inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const v_float64x4 &b)
980 { return v_float64x4(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); }
981 
983 #define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \
984  inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
985  { return ~(a == b); } \
986  inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
987  { return b > a; } \
988  inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
989  { return ~(a < b); } \
990  inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
991  { return b >= a; }
992 
993 #define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
994  inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
995  { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
996  inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
997  { \
998  return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \
999  } \
1000  inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
1001  { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
1002  inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
1003  { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \
1004  OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \
1005  OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec)
1006 
1007 OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint8x32, v_int8x32, b, bu)
1008 OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu)
1009 OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu)
1010 
1011 #define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \
1012  inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1013  { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \
1014  inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1015  { return ~(a == b); }
1016 
1017 OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d)
1018 OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d)
1019 
1020 #define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
1021  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
1022  { return _Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); }
1023 
1024 #define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \
1025  OPENCV_HAL_IMPL_LASX_CMP_FLT(==, xvfcmp_ceq, _Tpvec, ssuffix) \
1026  OPENCV_HAL_IMPL_LASX_CMP_FLT(!=, xvfcmp_cne, _Tpvec, ssuffix) \
1027  OPENCV_HAL_IMPL_LASX_CMP_FLT(<, xvfcmp_clt, _Tpvec, ssuffix) \
1028  OPENCV_HAL_IMPL_LASX_CMP_FLT(<=, xvfcmp_cle, _Tpvec, ssuffix)
1029 
1030 OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s)
1031 OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d)
1032 
1033 inline v_float32x8 operator > (const v_float32x8 &a, const v_float32x8 &b)
1034 { return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); }
1035 
1036 inline v_float32x8 operator >= (const v_float32x8 &a, const v_float32x8 &b)
1037 { return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); }
1038 
1039 inline v_float64x4 operator > (const v_float64x4 &a, const v_float64x4 &b)
1040 { return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); }
1041 
1042 inline v_float64x4 operator >= (const v_float64x4 &a, const v_float64x4 &b)
1043 { return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); }
1044 
1045 inline v_float32x8 v_not_nan(const v_float32x8& a)
1046 { return v_float32x8(__lasx_xvfcmp_cor_s(a.val, a.val)); }
1047 inline v_float64x4 v_not_nan(const v_float64x4& a)
1048 { return v_float64x4(__lasx_xvfcmp_cor_d(a.val, a.val)); }
1049 
1051 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint8x32, __lasx_xvmin_bu)
1052 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint8x32, __lasx_xvmax_bu)
1053 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int8x32, __lasx_xvmin_b)
1054 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int8x32, __lasx_xvmax_b)
1055 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint16x16, __lasx_xvmin_hu)
1056 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint16x16, __lasx_xvmax_hu)
1057 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int16x16, __lasx_xvmin_h)
1058 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int16x16, __lasx_xvmax_h)
1059 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint32x8, __lasx_xvmin_wu)
1060 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint32x8, __lasx_xvmax_wu)
1061 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int32x8, __lasx_xvmin_w)
1062 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int32x8, __lasx_xvmax_w)
1063 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float32x8, __lasx_xvfmin_s)
1064 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float32x8, __lasx_xvfmax_s)
1065 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float64x4, __lasx_xvfmin_d)
1066 OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float64x4, __lasx_xvfmax_d)
1067 
1068 
1069 template<int imm>
1070 inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
1071 {
1072  enum {IMM_R = (16 - imm) & 0xFF};
1073  enum {IMM_R2 = (32 - imm) & 0xFF};
1074 
1075  if (imm == 0) return a;
1076  if (imm == 32) return b;
1077  if (imm > 32) return v_uint8x32();
1078 
1079  __m256i swap = _v256_permute2x128<0x21>(a.val, b.val);
1080  if (imm == 16) return v_uint8x32(swap);
1081  if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swap, IMM_R));
1082  return v_uint8x32(_v256_alignr_b(swap, b.val, IMM_R2)); // imm < 32
1083 }
1084 
1085 template<int imm>
1086 inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
1087 {
1088  enum {IMM_L = (imm - 16) & 0xFF};
1089 
1090  if (imm == 0) return a;
1091  if (imm == 32) return b;
1092  if (imm > 32) return v_uint8x32();
1093 
1094  __m256i swap = _v256_permute2x128<0x03>(a.val, b.val);
1095  if (imm == 16) return v_uint8x32(swap);
1096  if (imm < 16) return v_uint8x32(_v256_alignr_b(swap, a.val, imm));
1097  return v_uint8x32(_v256_alignr_b(b.val, swap, IMM_L));
1098 }
1099 
1100 template<int imm>
1101 inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
1102 {
1103  enum {IMM_L = (imm - 16) & 0xFF};
1104  enum {IMM_R = (16 - imm) & 0xFF};
1105 
1106  if (imm == 0) return a;
1107  if (imm > 32) return v_uint8x32();
1108 
1109  // ESAC control[3] ? [127:0] = 0
1110  __m256i vzero = __lasx_xvreplgr2vr_w(0);
1111  __m256i swapz = __lasx_xvpermi_q(a.val, vzero, 0x20);
1112  if (imm == 16) return v_uint8x32(swapz);
1113  if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swapz, IMM_R));
1114  return v_uint8x32(__lasx_xvbsll_v(swapz, IMM_L));
1115 }
1116 
1117 template<int imm>
1118 inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
1119 {
1120  enum {IMM_L = (imm - 16) & 0xFF};
1121 
1122  if (imm == 0) return a;
1123  if (imm > 32) return v_uint8x32();
1124 
1125  // ESAC control[3] ? [127:0] = 0
1126  __m256i vzero = __lasx_xvreplgr2vr_w(0);
1127  __m256i swapz = __lasx_xvpermi_q(vzero, a.val, 0x21);
1128  if (imm == 16) return v_uint8x32(swapz);
1129  if (imm < 16) return v_uint8x32(_v256_alignr_b(swapz, a.val, imm));
1130  return v_uint8x32(__lasx_xvbsrl_v(swapz, IMM_L));
1131 }
1132 
1133 #define OPENCV_HAL_IMPL_LASX_ROTATE_CAST(intrin, _Tpvec, cast) \
1134  template<int imm> \
1135  inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
1136  { \
1137  enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1138  v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a), \
1139  v_reinterpret_as_u8(b)); \
1140  return _Tpvec(cast(ret.val)); \
1141  } \
1142  template<int imm> \
1143  inline _Tpvec intrin(const _Tpvec& a) \
1144  { \
1145  enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \
1146  v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a)); \
1147  return _Tpvec(cast(ret.val)); \
1148  }
1149 
1150 #define OPENCV_HAL_IMPL_LASX_ROTATE(_Tpvec) \
1151  OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \
1152  OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
1153 
1154 OPENCV_HAL_IMPL_LASX_ROTATE(v_int8x32)
1155 OPENCV_HAL_IMPL_LASX_ROTATE(v_uint16x16)
1156 OPENCV_HAL_IMPL_LASX_ROTATE(v_int16x16)
1157 OPENCV_HAL_IMPL_LASX_ROTATE(v_uint32x8)
1158 OPENCV_HAL_IMPL_LASX_ROTATE(v_int32x8)
1159 OPENCV_HAL_IMPL_LASX_ROTATE(v_uint64x4)
1160 OPENCV_HAL_IMPL_LASX_ROTATE(v_int64x4)
1161 
1162 OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float32x8, _lasx_256_castsi256_ps)
1163 OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float32x8, _lasx_256_castsi256_ps)
1164 OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float64x4, _lasx_256_castsi256_pd)
1165 OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float64x4, _lasx_256_castsi256_pd)
1166 
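// Usage sketch (illustrative lanes): v_rotate_right<n> shifts elements toward
// lane 0, filling from the second operand (or zero in the one-argument form);
// v_rotate_left<n> goes the other way.
//
//     v_uint32x8 a(0, 1, 2, 3, 4, 5, 6, 7);
//     v_uint32x8 b(8, 9, 10, 11, 12, 13, 14, 15);
//     v_uint32x8 r1 = v_rotate_right<2>(a);      // {2,3,4,5,6,7,0,0}
//     v_uint32x8 r2 = v_rotate_right<2>(a, b);   // {2,3,4,5,6,7,8,9}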
1167 
1168 inline v_uint8x32 v_reverse(const v_uint8x32 &a)
1169 {
1170  static const __m256i perm = _v256_setr_b(
1171  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
1172  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1173  __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm);
1174  return v_uint8x32(__lasx_xvpermi_q(vec, vec, 1));
1175 }
1176 
1177 inline v_int8x32 v_reverse(const v_int8x32 &a)
1178 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1179 
1180 inline v_uint16x16 v_reverse(const v_uint16x16 &a)
1181 {
1182  __m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B);
1183  vec = __lasx_xvshuf4i_w(vec, 0x4E);
1184  return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E));
1185 }
1186 
1187 inline v_int16x16 v_reverse(const v_int16x16 &a)
1188 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1189 
1190 inline v_uint32x8 v_reverse(const v_uint32x8 &a)
1191 {
1192  __m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B);
1193  return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E));
1194 }
1195 
1196 inline v_int32x8 v_reverse(const v_int32x8 &a)
1197 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1198 
1199 inline v_float32x8 v_reverse(const v_float32x8 &a)
1200 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1201 
1202 inline v_uint64x4 v_reverse(const v_uint64x4 &a)
1203 {
1204  return v_uint64x4(__lasx_xvpermi_d(a.val, 0x1b));
1205 }
1206 
1207 inline v_int64x4 v_reverse(const v_int64x4 &a)
1208 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1209 
1210 inline v_float64x4 v_reverse(const v_float64x4 &a)
1211 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1212 
1214 
1216 // this function returns a[0]+a[1]+...+a[31]
1217 inline unsigned v_reduce_sum(const v_uint8x32& a)
1218 {
1219  __m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val);
1220  __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
1221  __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
1222  __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
1223  return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
1224 }
1225 
1226 inline int v_reduce_sum(const v_int8x32& a)
1227 {
1228  __m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val);
1229  __m256i t2 = __lasx_xvhaddw_w_h(t1, t1);
1230  __m256i t3 = __lasx_xvhaddw_d_w(t2, t2);
1231  __m256i t4 = __lasx_xvhaddw_q_d(t3, t3);
1232  return (int)(((v8i32)t4)[0]+((v8i32)t4)[4]);
1233 }
1234 
1235 #define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \
1236  inline sctype v_reduce_##func(const _Tpvec& a) \
1237  { \
1238  __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
1239  val = intrin(val, __lsx_vbsrl_v(val,8)); \
1240  val = intrin(val, __lsx_vbsrl_v(val,4)); \
1241  val = intrin(val, __lsx_vbsrl_v(val,2)); \
1242  val = intrin(val, __lsx_vbsrl_v(val,1)); \
1243  return (sctype)__lsx_vpickve2gr_w(val, 0); \
1244  }
1245 
1246 OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, min, __lsx_vmin_bu)
1247 OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, min, __lsx_vmin_b)
1248 OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, max, __lsx_vmax_bu)
1249 OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, max, __lsx_vmax_b)
1250 
1251 #define OPENCV_HAL_IMPL_LASX_REDUCE_16(_Tpvec, sctype, func, intrin) \
1252  inline sctype v_reduce_##func(const _Tpvec& a) \
1253  { \
1254  __m128i v0 = _v256_extract_low(a.val); \
1255  __m128i v1 = _v256_extract_high(a.val); \
1256  v0 = intrin(v0, v1); \
1257  v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \
1258  v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \
1259  v0 = intrin(v0, __lsx_vbsrl_v(v0, 2)); \
1260  return (sctype) __lsx_vpickve2gr_w(v0, 0); \
1261  }
1262 
1263 OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, min, __lsx_vmin_hu)
1264 OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, min, __lsx_vmin_h)
1265 OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, max, __lsx_vmax_hu)
1266 OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, max, __lsx_vmax_h)
1267 
1268 #define OPENCV_HAL_IMPL_LASX_REDUCE_8(_Tpvec, sctype, func, intrin) \
1269  inline sctype v_reduce_##func(const _Tpvec& a) \
1270  { \
1271  __m128i v0 = _v256_extract_low(a.val); \
1272  __m128i v1 = _v256_extract_high(a.val); \
1273  v0 = intrin(v0, v1); \
1274  v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \
1275  v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \
1276  return (sctype) __lsx_vpickve2gr_w(v0, 0); \
1277  }
1278 
1279 OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, min, __lsx_vmin_wu)
1280 OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, min, __lsx_vmin_w)
1281 OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, max, __lsx_vmax_wu)
1282 OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, max, __lsx_vmax_w)
1283 
1284 #define OPENCV_HAL_IMPL_LASX_REDUCE_FLT(func, intrin) \
1285  inline float v_reduce_##func(const v_float32x8& a) \
1286  { \
1287  __m128 v0 = _v256_extract_low(a.val); \
1288  __m128 v1 = _v256_extract_high(a.val); \
1289  v0 = intrin(v0, v1); \
1290  v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x0e))); \
1291  v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x01))); \
1292  float *fvalue = (float*)&v0; \
1293  return fvalue[0]; \
1294  }
1295 
1296 OPENCV_HAL_IMPL_LASX_REDUCE_FLT(min, __lsx_vfmin_s)
1297 OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s)
1298 
1299 inline int v_reduce_sum(const v_int32x8& a)
1300 {
1301  __m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val);
1302  __m256i t2 = __lasx_xvhaddw_q_d(t1, t1);
1303  return (int)(((v8i32)t2)[0]+((v8i32)t2)[4]);
1304 }
1305 
1306 inline unsigned v_reduce_sum(const v_uint32x8& a)
1307 { return v_reduce_sum(v_reinterpret_as_s32(a)); }
1308 
1309 inline int v_reduce_sum(const v_int16x16& a)
1310 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1311 inline unsigned v_reduce_sum(const v_uint16x16& a)
1312 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1313 
1314 inline float v_reduce_sum(const v_float32x8& a)
1315 {
1316  float result = 0;
1317  float *pa = (float*)&a;
1318  for (int i = 0; i < 2; ++i) {
1319  result += pa[i*4] + pa[i*4+1] + pa[i*4+2] + pa[i*4+3];
1320  }
1321  return result;
1322 }
1323 
1324 inline uint64 v_reduce_sum(const v_uint64x4& a)
1325 {
1326  __m256i t0 = __lasx_xvhaddw_qu_du(a.val, a.val);
1327  return (uint64)(((v4u64)t0)[0] + ((v4u64)t0)[2]);
1328 }
1329 inline int64 v_reduce_sum(const v_int64x4& a)
1330 {
1331  __m256i t0 = __lasx_xvhaddw_q_d(a.val, a.val);
1332  return (int64)(((v4i64)t0)[0] + ((v4i64)t0)[2]);
1333 }
1334 inline double v_reduce_sum(const v_float64x4& a)
1335 {
1336  double *pa = (double*)&a;
1337  return pa[0] + pa[1] + pa[2] + pa[3];
1338 }
1339 
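// Usage sketch for the horizontal reductions above (illustrative values):
//
//     v_int32x8 v(1, 2, 3, 4, 5, 6, 7, 8);
//     int s  = v_reduce_sum(v);   // 36
//     int mn = v_reduce_min(v);   // 1
//     int mx = v_reduce_max(v);   // 8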
1340 inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
1341  const v_float32x8& c, const v_float32x8& d)
1342 {
1343  float *pa = (float*)&a;
1344  float *pb = (float*)&b;
1345  float *pc = (float*)&c;
1346  float *pd = (float*)&d;
1347 
1348  float v0 = pa[0] + pa[1] + pa[2] + pa[3];
1349  float v1 = pb[0] + pb[1] + pb[2] + pb[3];
1350  float v2 = pc[0] + pc[1] + pc[2] + pc[3];
1351  float v3 = pd[0] + pd[1] + pd[2] + pd[3];
1352  float v4 = pa[4] + pa[5] + pa[6] + pa[7];
1353  float v5 = pb[4] + pb[5] + pb[6] + pb[7];
1354  float v6 = pc[4] + pc[5] + pc[6] + pc[7];
1355  float v7 = pd[4] + pd[5] + pd[6] + pd[7];
1356  return v_float32x8(v0, v1, v2, v3, v4, v5, v6, v7);
1357 }
1358 
1359 inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
1360 {
1361  __m256i t0 = __lasx_xvabsd_bu(a.val, b.val);
1362  __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
1363  __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
1364  __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
1365  __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
1366  return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
1367 }
1368 inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
1369 {
1370  __m256i t0 = __lasx_xvabsd_b(a.val, b.val);
1371  __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
1372  __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
1373  __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
1374  __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
1375  return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
1376 }
1377 inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
1378 {
1379  v_uint32x8 l, h;
1380  v_expand(v_add_wrap(a - b, b - a), l, h);
1381  return v_reduce_sum(l + h);
1382 }
1383 inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
1384 {
1385  v_uint32x8 l, h;
1386  v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
1387  return v_reduce_sum(l + h);
1388 }
1389 inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
1390 {
1391  return v_reduce_sum(v_max(a, b) - v_min(a, b));
1392 }
1393 inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
1394 {
1395  v_int32x8 m = a < b;
1396  return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
1397 }
1398 inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
1399 {
1400  v_float32x8 a_b = a - b;
1401  return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff)));
1402 }
1403 
1405 inline v_uint8x32 v_popcount(const v_uint8x32& a)
1406 { return v_uint8x32(__lasx_xvpcnt_b(a.val)); }
1407 inline v_uint16x16 v_popcount(const v_uint16x16& a)
1408 { return v_uint16x16(__lasx_xvpcnt_h(a.val)); }
1409 inline v_uint32x8 v_popcount(const v_uint32x8& a)
1410 { return v_uint32x8(__lasx_xvpcnt_w(a.val)); }
1411 inline v_uint64x4 v_popcount(const v_uint64x4& a)
1412 { return v_uint64x4(__lasx_xvpcnt_d(a.val)); }
1413 inline v_uint8x32 v_popcount(const v_int8x32& a)
1414 { return v_popcount(v_reinterpret_as_u8(a)); }
1415 inline v_uint16x16 v_popcount(const v_int16x16& a)
1416 { return v_popcount(v_reinterpret_as_u16(a)); }
1417 inline v_uint32x8 v_popcount(const v_int32x8& a)
1418 { return v_popcount(v_reinterpret_as_u32(a)); }
1419 inline v_uint64x4 v_popcount(const v_int64x4& a)
1420 { return v_popcount(v_reinterpret_as_u64(a)); }
1421 
1422 inline int v_signmask(const v_int8x32& a)
1423 {
1424  __m256i result = __lasx_xvmskltz_b(a.val);
1425  int mask = __lasx_xvpickve2gr_w(result, 0);
1426  mask |= (__lasx_xvpickve2gr_w(result, 4) << 16);
1427  return mask;
1428 }
1429 inline int v_signmask(const v_uint8x32& a)
1430 { return v_signmask(v_reinterpret_as_s8(a)); }
1431 
1432 inline int v_signmask(const v_int16x16& a)
1433 { return v_signmask(v_pack(a, a)) & 0xFFFF; }
1434 inline int v_signmask(const v_uint16x16& a)
1435 { return v_signmask(v_reinterpret_as_s16(a)); }
1436 
1437 inline int v_signmask(const v_int32x8& a)
1438 {
1439  __m256i result = __lasx_xvmskltz_w(a.val);
1440  int mask = __lasx_xvpickve2gr_w(result, 0);
1441  mask |= (__lasx_xvpickve2gr_w(result, 4) << 4);
1442  return mask;
1443 }
1444 inline int v_signmask(const v_uint32x8& a)
1445 { return v_signmask(*(v_int32x8*)(&a)); }
1446 
1447 inline int v_signmask(const v_int64x4& a)
1448 {
1449  __m256i result = __lasx_xvmskltz_d(a.val);
1450  int mask = __lasx_xvpickve2gr_d(result, 0);
1451  mask |= (__lasx_xvpickve2gr_w(result, 4) << 2);
1452  return mask;
1453 }
1454 inline int v_signmask(const v_uint64x4& a)
1455 { return v_signmask(v_reinterpret_as_s64(a)); }
1456 
1457 inline int v_signmask(const v_float32x8& a)
1458 { return v_signmask(*(v_int32x8*)(&a)); }
1459 
1460 inline int v_signmask(const v_float64x4& a)
1461 { return v_signmask(*(v_int64x4*)(&a)); }
1462 
1463 inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1464 inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1465 inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1466 inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1467 inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1468 inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1469 inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1470 inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1471 inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1472 inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1473 
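// Usage sketch: v_signmask packs the sign bit of every lane into an int (lane 0 ->
// bit 0) and v_scan_forward returns the index of the first lane whose sign bit is
// set, which together implement "find first match" (v256_load for int assumed):
//
//     static inline int first_less(const int* a, const int* b)
//     {
//         v_int32x8 lt = v256_load(a) < v256_load(b);   // all-ones lanes where a[i] < b[i]
//         int mask = v_signmask(lt);                    // lane i -> bit i
//         return mask ? v_scan_forward(lt) : -1;
//     }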
1475 #define OPENCV_HAL_IMPL_LASX_CHECK(_Tpvec, allmask) \
1476  inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
1477  inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
1478 OPENCV_HAL_IMPL_LASX_CHECK(v_uint8x32, -1)
1479 OPENCV_HAL_IMPL_LASX_CHECK(v_int8x32, -1)
1480 OPENCV_HAL_IMPL_LASX_CHECK(v_uint32x8, 255)
1481 OPENCV_HAL_IMPL_LASX_CHECK(v_int32x8, 255)
1482 OPENCV_HAL_IMPL_LASX_CHECK(v_uint64x4, 15)
1483 OPENCV_HAL_IMPL_LASX_CHECK(v_int64x4, 15)
1484 OPENCV_HAL_IMPL_LASX_CHECK(v_float32x8, 255)
1485 OPENCV_HAL_IMPL_LASX_CHECK(v_float64x4, 15)
1486 
1487 #define OPENCV_HAL_IMPL_LASX_CHECK_SHORT(_Tpvec) \
1488  inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
1489  inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
1490 OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_uint16x16)
1491 OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16)
1492 
1493 
1496 #define OPENCV_HAL_IMPL_LASX_MULADD(_Tpvec, suffix) \
1497  inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1498  { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \
1499  inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1500  { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \
1501  inline _Tpvec v_sqrt(const _Tpvec& x) \
1502  { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \
1503  inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1504  { return v_fma(a, a, b * b); } \
1505  inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1506  { return v_sqrt(v_fma(a, a, b*b)); }
1507 
1508 OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s)
1509 OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
1510 
1511 inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1512 {
1513  return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val));
1514 }
1515 
1516 inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
1517 {
1518  return v_fma(a, b, c);
1519 }
1520 
1521 inline v_float32x8 v_invsqrt(const v_float32x8& x)
1522 { return v_float32x8(__lasx_xvfrsqrt_s(x.val)); }
1523 
1524 inline v_float64x4 v_invsqrt(const v_float64x4& x)
1525 { return v_float64x4(__lasx_xvfrsqrt_d(x.val)); }
1526 
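// Usage sketch: v_fma / v_muladd lower to a single xvfmadd, so a*x+y kernels can be
// written directly. A minimal example, assuming the v256_load / v_store overloads
// for float and a v256_setall_f32 helper defined elsewhere in this header:
//
//     static inline void axpy8(float a, const float* x, const float* y, float* dst)
//     {
//         v_float32x8 va = v256_setall_f32(a);
//         v_store(dst, v_fma(va, v256_load(x), v256_load(y)));   // dst[i] = a*x[i] + y[i]
//     }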
1528 #define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix) \
1529  inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
1530  { return v_u##_Tpvec(__lasx_xvabsd_##suffix(x.val, __lasx_xvreplgr2vr_w(0))); }
1531 
1532 OPENCV_HAL_IMPL_LASX_ABS(int8x32, b)
1533 OPENCV_HAL_IMPL_LASX_ABS(int16x16, h)
1534 OPENCV_HAL_IMPL_LASX_ABS(int32x8, w)
1535 
1536 inline v_float32x8 v_abs(const v_float32x8& x)
1537 { return v_float32x8(*((__m256i*)&x) & __lasx_xvreplgr2vr_w(0x7fffffff)); }
1538 inline v_float64x4 v_abs(const v_float64x4& x)
1539 { return v_float64x4(*((__m256i*)&x) & __lasx_xvreplgr2vr_d(0x7fffffffffffffff)); }
1540 
1542 inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
1543 { return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); }
1544 inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
1545 { return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); }
1546 inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
1547 { return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); }
1548 
1549 inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
1550 { return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); }
1551 inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
1552 { return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); }
1553 inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
1554 { return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
1555 
1556 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
1557 { return v_abs(a - b); }
1558 
1559 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
1560 { return v_abs(a - b); }
1561 
1563 inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
1564 {
1565  v_int8x32 d = a - b;
1566  v_int8x32 m = a < b;
1567  return (d ^ m) - m;
1568 }
1569 inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
1570 { return v_max(a, b) - v_min(a, b); }
1571 
1573 
1575 inline v_int32x8 v_round(const v_float32x8& a)
1576 { return v_int32x8(__lasx_xvftint_w_s(a.val)); }
1577 
1578 inline v_int32x8 v_round(const v_float64x4& a)
1579 { __m256i t = __lasx_xvftint_w_d(a.val, a.val);
1580  return v_int32x8(__lasx_xvpermi_d(t, 0x88)); }
1581 
1582 inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
1583 {
1584  __m256i abi = __lasx_xvftint_w_d(b.val, a.val);
1585  return v_int32x8(__lasx_xvpermi_d(abi, 0b11011000)); //3120
1586 }
1587 
1588 inline v_int32x8 v_trunc(const v_float32x8& a)
1589 { return v_int32x8(__lasx_xvftintrz_w_s(a.val)); }
1590 
1591 inline v_int32x8 v_trunc(const v_float64x4& a)
1592 { __m256i t = __lasx_xvftintrz_w_d(a.val, a.val);
1593  return v_int32x8(__lasx_xvpermi_d(t, 0x88)); }
1594 
1595 inline v_int32x8 v_floor(const v_float32x8& a)
1596 { return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrm_s(a.val)))); }
1597 
1598 inline v_int32x8 v_floor(const v_float64x4& a)
1599 { return v_trunc(v_float64x4(__lasx_xvfrintrm_d(a.val))); }
1600 
1601 inline v_int32x8 v_ceil(const v_float32x8& a)
1602 { return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrp_s(a.val)))); }
1603 
1604 inline v_int32x8 v_ceil(const v_float64x4& a)
1605 { return v_trunc(v_float64x4(__lasx_xvfrintrp_d(a.val))); }
1606 
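// Usage sketch: the four conversions differ only in rounding mode -- v_round rounds
// to nearest, v_trunc drops the fraction, v_floor rounds down, v_ceil rounds up
// (v256_load / v_store assumed):
//
//     v_float32x8 v = v256_load(src);
//     v_int32x8 nearest = v_round(v);
//     v_int32x8 toward0 = v_trunc(v);
//     v_int32x8 down    = v_floor(v);
//     v_int32x8 up      = v_ceil(v);
//     v_store(dst, nearest);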
1608 inline v_float32x8 v_cvt_f32(const v_int32x8& a)
1609 { return v_float32x8(__lasx_xvffint_s_w(a.val)); }
1610 
1611 inline v_float32x8 v_cvt_f32(const v_float64x4& a)
1612 { return v_float32x8(__lasx_xvpermi_d(__lasx_xvfcvt_s_d(a.val, a.val), 0x88)); }
1613 
1614 inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
1615 {
1616  __m256 abf = __lasx_xvfcvt_s_d(a.val, b.val); // warning: the a,b argument order here is swapped relative to the xvfcvt.s.d operand order
1617  return v_float32x8(__lasx_xvpermi_d(abf, 0x8D));
1618 }
1619 
1620 inline v_float64x4 v_cvt_f64(const v_int32x8& a)
1621 {
1622  __m256i alow = __lasx_xvpermi_d(a.val, 0x10);
1623  return v_float64x4(__lasx_xvffintl_d_w(alow));
1624 }
1625 
1626 inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
1627 {
1628  __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32);
1629  return v_float64x4(__lasx_xvffintl_d_w(ahigh));
1630 }
1631 
1632 inline v_float64x4 v_cvt_f64(const v_float32x8& a)
1633 {
1634  __m256i alow = __lasx_xvpermi_d(a.val, 0x10);
1635  return v_float64x4(__lasx_xvfcvtl_d_s((__m256)alow));
1636 }
1637 
1638 inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
1639 {
1640  __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32);
1641  return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh));
1642 }
1643 
1644 inline v_float64x4 v_cvt_f64(const v_int64x4& v)
1645 { return v_float64x4(__lasx_xvffint_d_l(v.val)); }
1646 
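// Usage sketch: one v_float32x8 widens to two v_float64x4 halves, so both halves
// must be processed to cover all eight lanes (v256_load assumed):
//
//     v_float32x8 v = v256_load(src);
//     v_float64x4 lo = v_cvt_f64(v);         // lanes 0..3
//     v_float64x4 hi = v_cvt_f64_high(v);    // lanes 4..7
//     v_float64x4 s  = lo + hi;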
1648 
1649 inline v_int8x32 v256_lut(const schar* tab, const int* idx)
1650 {
1651  return v_int8x32(_v256_setr_b(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]],
1652  tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]],
1653  tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]], tab[idx[16]], tab[idx[17]],
1654  tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
1655  tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]],
1656  tab[idx[30]], tab[idx[31]]));
1657 }
1658 inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
1659 {
1660  return v_int8x32(_v256_setr_h(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]),
1661  *(const short*)(tab + idx[ 3]), *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]),
1662  *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]), *(const short*)(tab + idx[ 8]),
1663  *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
1664  *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]),
1665  *(const short*)(tab + idx[15])));
1666 }
1667 inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
1668 {
1669  return v_int8x32(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1670  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
1671  *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
1672  *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7])));
1673 }
1674 inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
1675 inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
1676 inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
1677 
1678 inline v_int16x16 v256_lut(const short* tab, const int* idx)
1679 {
1680  return v_int16x16(_v256_setr_h(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]],
1681  tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]],
1682  tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]],
1683  tab[idx[15]]));
1684 }
1685 inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
1686 {
1687  return v_int16x16(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1688  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
1689  *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
1690  *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) ));
1691 }
1692 inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
1693 {
1694  return v_int16x16(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
1695  *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
1696 
1697 }
1698 inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
1699 inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
1700 inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
1701 
1702 inline v_int32x8 v256_lut(const int* tab, const int* idx)
1703 {
1704  return v_int32x8(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1705  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]),
1706  *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]),
1707  *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) ));
1708 }
1709 inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
1710 {
1711  return v_int32x8(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
1712  *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
1713 }
1714 inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
1715 {
1716  return v_int32x8(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0)));
1717 }
1718 inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
1719 inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
1720 inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
1721 
1722 inline v_int64x4 v256_lut(const int64* tab, const int* idx)
1723 {
1724  return v_int64x4(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]),
1725  *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) ));
1726 }
1727 inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
1728 {
1729  return v_int64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0)));
1730 }
1731 inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
1732 inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
1733 
1734 inline v_float32x8 v256_lut(const float* tab, const int* idx)
1735 {
1736  return v_float32x8(_v256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1737  tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1738 }
1739 inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
1740 inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }
1741 
1742 inline v_float64x4 v256_lut(const double* tab, const int* idx)
1743 {
1744  return v_float64x4(_v256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
1745 }
1746 inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx)
1747 { return v_float64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); }
1748 
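// Usage sketch: the v256_lut family gathers table elements through an index array;
// the _pairs/_quads variants read 2/4 consecutive elements per index and therefore
// consume only idx[0..3] / idx[0..1] for 32-bit element types:
//
//     static const int idx[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };
//     v_float32x8 g = v256_lut(table, idx);          // g[i] = table[idx[i]]
//     v_float32x8 p = v256_lut_pairs(table, idx);    // table[idx[0]], table[idx[0]+1], table[idx[1]], ...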
1749 inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
1750 {
1751  int *idx = (int*)&idxvec.val;
1752  return v256_lut(tab, idx);
1753 }
1754 
1755 inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
1756 {
1757  return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1758 }
1759 
1760 inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
1761 {
1762  const int *idx = (const int*)&idxvec.val;
1763  return v256_lut(tab, idx);
1764 }
1765 
1766 inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
1767 {
1768  const int *idx = (const int*)&idxvec.val;
1769  return v256_lut(tab, idx);
1770 }
1771 
1772 inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
1773 {
1774  const int *idx = (const int*)&idxvec.val;
1775  __m128i xy01, xy45, xy23, xy67;
1776  xy01 = __lsx_vld(tab + idx[0], 0);
1777  xy01 = __lsx_vextrins_d(xy01, __lsx_vld(tab + idx[1], 0), 0x10);
1778  xy45 = __lsx_vld(tab + idx[4], 0);
1779  xy45 = __lsx_vextrins_d(xy45, __lsx_vld(tab + idx[5], 0), 0x10);
1780  __m256i xy0145 = _v256_combine(xy01, xy45);
1781  xy23 = __lsx_vld(tab + idx[2], 0);
1782  xy23 = __lsx_vextrins_d(xy23, __lsx_vld(tab + idx[3], 0), 0x10);
1783  xy67 = __lsx_vld(tab + idx[6], 0);
1784  xy67 = __lsx_vextrins_d(xy67, __lsx_vld(tab + idx[7], 0), 0x10);
1785  __m256i xy2367 = _v256_combine(xy23, xy67);
1786 
1787  __m256i xxyy0145 = __lasx_xvilvl_w(xy2367, xy0145);
1788  __m256i xxyy2367 = __lasx_xvilvh_w(xy2367, xy0145);
1789 
1790  x = v_float32x8(__lasx_xvilvl_w(xxyy2367, xxyy0145));
1791  y = v_float32x8(__lasx_xvilvh_w(xxyy2367, xxyy0145));
1792 }
1793 
1794 inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
1795 {
1796  //int CV_DECL_ALIGNED(32) idx[4];
1797  const int *idx = (const int*)&idxvec.val;
1798  __m128i xy0 = __lsx_vld(tab + idx[0], 0);
1799  __m128i xy2 = __lsx_vld(tab + idx[2], 0);
1800  __m128i xy1 = __lsx_vld(tab + idx[1], 0);
1801  __m128i xy3 = __lsx_vld(tab + idx[3], 0);
1802  __m256i xy02 = _v256_combine(xy0, xy2);
1803  __m256i xy13 = _v256_combine(xy1, xy3);
1804 
1805  x = v_float64x4(__lasx_xvilvl_d(xy13, xy02));
1806  y = v_float64x4(__lasx_xvilvh_d(xy13, xy02));
1807 }
1808 
1809 inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
1810 {
1811  return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
1812  _v256_set_d(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
1813 }
1814 inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec)
1815 { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1816 inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
1817 {
1818  return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val,
1819  _v256_set_d(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
1820 }
1821 inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec)
1822 { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1823 
1824 inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
1825 {
1826  return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
1827  _v256_set_d(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
1828 }
1829 inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec)
1830 { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1831 inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
1832 {
1833  return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val,
1834  _v256_set_d(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
1835 }
1836 inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec)
1837 { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1838 
1839 inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
1840 {
1841  return v_int32x8(__lasx_xvshuf4i_w(vec.val, 0xd8));
1842 }
1843 inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec)
1844 { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1845 inline v_float32x8 v_interleave_pairs(const v_float32x8& vec)
1846 { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1847 
1848 inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
1849 {
1850  __m256i vzero = __lasx_xvreplgr2vr_w(0);
1851  __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
1852  _v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100));
1853  return v_int8x32(__lasx_xvperm_w(t1,
1854  _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1855 }
1856 inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
1857 { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1858 
1859 inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
1860 {
1861  __m256i vzero = __lasx_xvreplgr2vr_w(0);
1862  __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
1863  _v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100));
1864  return v_int16x16(__lasx_xvperm_w(t1,
1865  _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1866 }
1867 inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec)
1868 { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1869 
1870 inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
1871 {
1872  return v_int32x8(__lasx_xvperm_w(vec.val,
1873  _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1874 }
1875 inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec)
1876 { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1877 inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
1878 {
1879  return v_float32x8(__lasx_xvperm_w(*(__m256i*)(&vec.val),
1880  _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
1881 }
1882 
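// Usage sketch: v_pack_triplets compacts two (x, y, z, pad) groups into the first
// six lanes; the trailing two lanes are unspecified filler and should be ignored:
//
//     v_int32x8 v(10, 11, 12, 0, 20, 21, 22, 0);
//     v_int32x8 t = v_pack_triplets(v);      // 10, 11, 12, 20, 21, 22 in lanes 0..5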
1884 
1886 
1887 // 16 >> 32
1888 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
1889 { return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); }
1890 
1891 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1892 { return v_dotprod(a, b) + c; }
1893 
1894 // 32 >> 64
1895 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
1896 {
1897  __m256i even = __lasx_xvmulwev_d_w(a.val, b.val);
1898  return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
1899 }
1900 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1901 {
1902  __m256i even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val);
1903  return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
1904 }
1905 
1906 // 8 >> 32
1907 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
1908 {
1909  __m256i even = __lasx_xvmulwev_h_bu(a.val, b.val);
1910  __m256i odd = __lasx_xvmulwod_h_bu(a.val, b.val);
1911  __m256i prod0 = __lasx_xvhaddw_wu_hu(even, even);
1912  __m256i prod1 = __lasx_xvhaddw_wu_hu(odd, odd);
1913  return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
1914 }
1915 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1916 { return v_dotprod_expand(a, b) + c; }
1917 
1918 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
1919 {
1920  __m256i even = __lasx_xvmulwev_h_b(a.val, b.val);
1921  __m256i odd = __lasx_xvmulwod_h_b(a.val, b.val);
1922  __m256i prod0 = __lasx_xvhaddw_w_h(even, even);
1923  __m256i prod1 = __lasx_xvhaddw_w_h(odd, odd);
1924  return v_int32x8(__lasx_xvadd_w(prod0, prod1));
1925 }
1926 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1927 { return v_dotprod_expand(a, b) + c; }
1928 
1929 // 16 >> 64
1930 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
1931 {
1932  __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
1933  __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
1934  __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
1935  __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
1936  return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
1937 }
1938 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1939 { return v_dotprod_expand(a, b) + c; }
1940 
1941 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
1942 {
1943  __m256i even = __lasx_xvmulwev_w_h(a.val, b.val);
1944  __m256i odd = __lasx_xvmulwod_w_h(a.val, b.val);
1945  __m256i prod0 = __lasx_xvhaddw_d_w(even, even);
1946  __m256i prod1 = __lasx_xvhaddw_d_w(odd, odd);
1947  return v_int64x4(__lasx_xvadd_d(prod0, prod1));
1948 }
1949 
1950 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
1951 { return v_dotprod_expand(a, b) + c; }
1952 
1953 // 32 >> 64f
1954 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
1955 { return v_cvt_f64(v_dotprod(a, b)); }
1956 inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
1957 { return v_dotprod_expand(a, b) + c; }
1958 
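// Usage sketch: v_dotprod multiplies adjacent 16-bit pairs and accumulates them in
// 32-bit lanes, so a dot product only needs one horizontal reduction at the end.
// A sketch for moderate n (v256_load for short, v_reduce_sum for v_int32x8 and a
// v256_setzero_s32 helper are assumed from elsewhere in this header):
//
//     static inline int dot16(const short* a, const short* b, int n)   // n % 16 == 0
//     {
//         v_int32x8 acc = v256_setzero_s32();
//         for (int i = 0; i < n; i += 16)
//             acc = v_dotprod(v256_load(a + i), v256_load(b + i), acc);
//         return v_reduce_sum(acc);
//     }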
1960 
1961 // 16 >> 32
1962 inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
1963 { return v_dotprod(a, b); }
1964 inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1965 { return v_dotprod(a, b, c); }
1966 
1967 // 32 >> 64
1968 inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
1969 { return v_dotprod(a, b); }
1970 inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
1971 { return v_dotprod(a, b, c); }
1972 
1973 // 8 >> 32
1974 inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
1975 { return v_dotprod_expand(a, b); }
1976 inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
1977 { return v_dotprod_expand(a, b, c); }
1978 
1979 inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
1980 { return v_dotprod_expand(a, b); }
1981 inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
1982 { return v_dotprod_expand(a, b, c); }
1983 
1984 // 16 >> 64
1985 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
1986 {
1987  __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val);
1988  __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val);
1989  __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
1990  __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
1991  return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
1992 }
1993 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
1994 { return v_dotprod_expand_fast(a, b) + c; }
1995 
1996 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
1997 {
1998  __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val));
1999  __m256i sign = __lasx_xvsrai_w(prod, 31);
2000  __m256i lo = __lasx_xvilvl_w(sign, prod);
2001  __m256i hi = __lasx_xvilvh_w(sign, prod);
2002  return v_int64x4(__lasx_xvadd_d(lo, hi));
2003 }
2004 inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
2005 { return v_dotprod_expand_fast(a, b) + c; }
2006 
2007 // 32 >> 64f
2008 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
2009 { return v_dotprod_expand(a, b); }
2010 inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
2011 { return v_dotprod_expand(a, b, c); }
2012 
2013 
2014 #define OPENCV_HAL_LASX_SPLAT2_PS(a, im) \
2015  v_float32x8(__lasx_xvpermi_w(a.val, a.val, im))
2016 
2017 inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
2018  const v_float32x8& m1, const v_float32x8& m2,
2019  const v_float32x8& m3)
2020 {
2021  v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);
2022  v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
2023  v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
2024  v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF);
2025  return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
2026 }
2027 
2028 inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
2029  const v_float32x8& m1, const v_float32x8& m2,
2030  const v_float32x8& a)
2031 {
2032  v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0);
2033  v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55);
2034  v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA);
2035  return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
2036 }
2037 
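// Usage sketch: within each 128-bit half the result is v[0]*m0 + v[1]*m1 + v[2]*m2
// + v[3]*m3, i.e. M*v when m0..m3 hold the matrix columns, so one call transforms
// two 4-element vectors at once (v256_load / v_store assumed):
//
//     // cols[k] holds the k-th column of the 4x4 matrix duplicated in both halves
//     v_float32x8 c0 = v256_load(cols[0]), c1 = v256_load(cols[1]);
//     v_float32x8 c2 = v256_load(cols[2]), c3 = v256_load(cols[3]);
//     v_float32x8 vv = v256_load(pts);       // (x0,y0,z0,w0, x1,y1,z1,w1)
//     v_store(out, v_matmul(vv, c0, c1, c2, c3));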
2038 
2039 #define OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(_Tpvec, cast_from, cast_to) \
2040  inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
2041  const _Tpvec& a2, const _Tpvec& a3, \
2042  _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
2043  { \
2044  __m256i t0 = cast_from(__lasx_xvilvl_w(a1.val, a0.val)); \
2045  __m256i t1 = cast_from(__lasx_xvilvl_w(a3.val, a2.val)); \
2046  __m256i t2 = cast_from(__lasx_xvilvh_w(a1.val, a0.val)); \
2047  __m256i t3 = cast_from(__lasx_xvilvh_w(a3.val, a2.val)); \
2048  b0.val = cast_to(__lasx_xvilvl_d(t1, t0)); \
2049  b1.val = cast_to(__lasx_xvilvh_d(t1, t0)); \
2050  b2.val = cast_to(__lasx_xvilvl_d(t3, t2)); \
2051  b3.val = cast_to(__lasx_xvilvh_d(t3, t2)); \
2052  }
2053 
2054 OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_uint32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2055 OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_int32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2056 
2057 inline void v_transpose4x4(const v_float32x8 &a0, const v_float32x8 &a1,
2058  const v_float32x8 &a2, const v_float32x8 &a3,
2059  v_float32x8 &b0, v_float32x8 &b1, v_float32x8 &b2, v_float32x8 &b3)
2060 {
2061  __m256i t0 = __lasx_xvilvl_w(__m256i(a1.val), __m256i(a0.val));
2062  __m256i t1 = __lasx_xvilvl_w(__m256i(a3.val), __m256i(a2.val));
2063  __m256i t2 = __lasx_xvilvh_w(__m256i(a1.val), __m256i(a0.val));
2064  __m256i t3 = __lasx_xvilvh_w(__m256i(a3.val), __m256i(a2.val));
2065  b0.val = __m256(__lasx_xvilvl_d(t1, t0));
2066  b1.val = __m256(__lasx_xvilvh_d(t1, t0));
2067  b2.val = __m256(__lasx_xvilvl_d(t3, t2));
2068  b3.val = __m256(__lasx_xvilvh_d(t3, t2));
2069 }
2070 
2072 
2073 /* Expand */
2074 #define OPENCV_HAL_IMPL_LASX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
2075  inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
2076  { \
2077  b0.val = intrin(a.val); \
2078  b1.val = intrin(__lasx_xvpermi_q(a.val, a.val, 0x11)); \
2079  } \
2080  inline _Tpwvec v_expand_low(const _Tpvec& a) \
2081  { return _Tpwvec(intrin(a.val)); } \
2082  inline _Tpwvec v_expand_high(const _Tpvec& a) \
2083  { return _Tpwvec(intrin(__lasx_xvpermi_q(a.val, a.val, 0x11))); } \
2084  inline _Tpwvec v256_load_expand(const _Tp* ptr) \
2085  { \
2086  __m128i a = __lsx_vld(ptr, 0); \
2087  return _Tpwvec(intrin(*((__m256i*)&a))); \
2088  }
2089 
2090 OPENCV_HAL_IMPL_LASX_EXPAND(v_uint8x32, v_uint16x16, uchar, __lasx_vext2xv_hu_bu)
2091 OPENCV_HAL_IMPL_LASX_EXPAND(v_int8x32, v_int16x16, schar, __lasx_vext2xv_h_b)
2092 OPENCV_HAL_IMPL_LASX_EXPAND(v_uint16x16, v_uint32x8, ushort, __lasx_vext2xv_wu_hu)
2093 OPENCV_HAL_IMPL_LASX_EXPAND(v_int16x16, v_int32x8, short, __lasx_vext2xv_w_h)
2094 OPENCV_HAL_IMPL_LASX_EXPAND(v_uint32x8, v_uint64x4, unsigned, __lasx_vext2xv_du_wu)
2095 OPENCV_HAL_IMPL_LASX_EXPAND(v_int32x8, v_int64x4, int, __lasx_vext2xv_d_w)
2096 
2097 #define OPENCV_HAL_IMPL_LASX_EXPAND_Q(_Tpvec, _Tp, intrin) \
2098  inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
2099  { \
2100  __m128i a = __lsx_vld(ptr, 0); \
2101  return _Tpvec(intrin(*((__m256i*)&a))); \
2102  }
2103 
2104 OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_uint32x8, uchar, __lasx_vext2xv_wu_bu)
2105 OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_int32x8, schar, __lasx_vext2xv_w_b)
2106 
2107 /* pack */
2108 // 16
2109 inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
2110 { return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); }
2111 
2112 inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
2113 { return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); }
2114 
2115 inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
2116 {
2117  return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a.val, b.val)));
2118 }
2119 
2120 inline void v_pack_store(schar* ptr, const v_int16x16& a)
2121 { v_store_low(ptr, v_pack(a, a)); }
2122 
2123 inline void v_pack_store(uchar *ptr, const v_uint16x16& a)
2124 { v_store_low(ptr, v_pack(a, a)); }
2125 
2126 inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
2127 { v_store_low(ptr, v_pack_u(a, a)); }
2128 
2129 template<int n> inline
2130 v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
2131 {
2132  __m256i res = __lasx_xvssrlrni_bu_h(b.val, a.val, n);
2133  return v_uint8x32(_v256_shuffle_odd_64(res));
2134 }
2135 
2136 template<int n> inline
2137 void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
2138 {
2139  __m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
2140  __lasx_xvstelm_d(res, ptr, 0, 0);
2141  __lasx_xvstelm_d(res, ptr, 8, 2);
2142 }
2143 
2144 template<int n> inline
2145 v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
2146 {
2147  __m256i res = __lasx_xvssrarni_bu_h(b.val, a.val, n);
2148  return v_uint8x32(_v256_shuffle_odd_64(res));
2149 }
2150 
2151 template<int n> inline
2152 void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
2153 {
2154  __m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n);
2155  __lasx_xvstelm_d(res, ptr, 0, 0);
2156  __lasx_xvstelm_d(res, ptr, 8, 2);
2157 }
2158 
2159 template<int n> inline
2160 v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
2161 {
2162  __m256i res = __lasx_xvssrarni_b_h(b.val, a.val, n);
2163  return v_int8x32(_v256_shuffle_odd_64(res));
2164 }
2165 
2166 template<int n> inline
2167 void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
2168 {
2169  __m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n);
2170  __lasx_xvstelm_d(res, ptr, 0, 0);
2171  __lasx_xvstelm_d(res, ptr, 8, 2);
2172 }
2173 
2174 // 32
2175 inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
2176 { return v_int16x16(_v256_shuffle_odd_64(_lasx_packs_w(a.val, b.val))); }
2177 
2178 inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
2179 { return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); }
2180 
2181 inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
2182 { return v_uint16x16(_v256_shuffle_odd_64(_lasx_packus_w(a.val, b.val))); }
2183 
2184 inline void v_pack_store(short* ptr, const v_int32x8& a)
2185 { v_store_low(ptr, v_pack(a, a)); }
2186 
2187 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
2188 {
2189  __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0);
2190  __lasx_xvstelm_d(res, ptr, 0, 0);
2191  __lasx_xvstelm_d(res, ptr, 8, 2);
2192 }
2193 
2194 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
2195 { v_store_low(ptr, v_pack_u(a, a)); }
2196 
2197 template<int n> inline
2198 v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
2199 { return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); }
2200 
2201 template<int n> inline
2202 void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
2203 {
2204  __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
2205  __lasx_xvstelm_d(res, ptr, 0, 0);
2206  __lasx_xvstelm_d(res, ptr, 8, 2);
2207 }
2208 
2209 template<int n> inline
2210 v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
2211 { return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); }
2212 
2213 template<int n> inline
2214 void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
2215 {
2216  __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
2217  __lasx_xvstelm_d(res, ptr, 0, 0);
2218  __lasx_xvstelm_d(res, ptr, 8, 2);
2219 }
2220 
2221 template<int n> inline
2222 v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
2223 { return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); }
2224 
2225 template<int n> inline
2226 void v_rshr_pack_store(short* ptr, const v_int32x8& a)
2227 {
2228  __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
2229  __lasx_xvstelm_d(res, ptr, 0, 0);
2230  __lasx_xvstelm_d(res, ptr, 8, 2);
2231 }
2232 
2233 // 64
2234 // Non-saturating pack
2235 inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
2236 {
2237  __m256i ab = __lasx_xvpickev_w(b.val, a.val);
2238  return v_uint32x8(_v256_shuffle_odd_64(ab));
2239 }
2240 
2241 inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
2242 { return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2243 
2244 inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
2245 {
2246  __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
2247  v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
2248 }
2249 
2250 inline void v_pack_store(int* ptr, const v_int64x4& b)
2251 { v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2252 
2253 template<int n> inline
2254 v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
2255 { return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); }
2256 
2257 template<int n> inline
2258 void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
2259 {
2260  __m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n);
2261  __lasx_xvstelm_d(res, ptr, 0, 0);
2262  __lasx_xvstelm_d(res, ptr, 8, 2);
2263 }
2264 
2265 template<int n> inline
2266 v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
2267 { return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); }
2268 
2269 template<int n> inline
2270 void v_rshr_pack_store(int* ptr, const v_int64x4& a)
2271 {
2272  __m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n);
2273  __lasx_xvstelm_d(res, ptr, 0, 0);
2274  __lasx_xvstelm_d(res, ptr, 8, 2);
2275 }
2276 
2277 // pack boolean
2278 inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
2279 {
2280  __m256i ab = _lasx_packs_h(a.val, b.val);
2281  return v_uint8x32(_v256_shuffle_odd_64(ab));
2282 }
2283 
2284 inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
2285  const v_uint32x8& c, const v_uint32x8& d)
2286 {
2287  __m256i ab = _lasx_packs_w(a.val, b.val);
2288  __m256i cd = _lasx_packs_w(c.val, d.val);
2289 
2290  __m256i abcd = _v256_shuffle_odd_64(_lasx_packs_h(ab, cd));
2291  return v_uint8x32(__lasx_xvshuf4i_w(abcd, 0xd8));
2292 }
2293 
2294 inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2295  const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
2296  const v_uint64x4& g, const v_uint64x4& h)
2297 {
2298  __m256i ab = _lasx_packs_w(a.val, b.val);
2299  __m256i cd = _lasx_packs_w(c.val, d.val);
2300  __m256i ef = _lasx_packs_w(e.val, f.val);
2301  __m256i gh = _lasx_packs_w(g.val, h.val);
2302 
2303  __m256i abcd = _lasx_packs_w(ab, cd);
2304  __m256i efgh = _lasx_packs_w(ef, gh);
2305  __m256i pkall = _v256_shuffle_odd_64(_lasx_packs_h(abcd, efgh));
2306 
2307  __m256i rev = _v256_alignr_b(pkall, pkall, 8);
2308  return v_uint8x32(__lasx_xvilvl_h(rev, pkall));
2309 }
2310 
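// Usage sketch: the pack family narrows with saturation; v_pack_u additionally
// clamps signed input to the unsigned range, and the *_store forms write only the
// narrowed low half of a register (v256_load for short assumed):
//
//     v_int16x16 v = v256_load(src16);
//     v_pack_u_store(dst8, v);               // 16 uchars, values clamped to [0, 255]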
2311 /* Recombine */
2312 // these are covered by the load and store operations above
2313 
2314 /* Extract */
2315 #define OPENCV_HAL_IMPL_LASX_EXTRACT(_Tpvec) \
2316  template<int s> \
2317  inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2318  { return v_rotate_right<s>(a, b); }
2319 
2320 OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint8x32)
2321 OPENCV_HAL_IMPL_LASX_EXTRACT(v_int8x32)
2322 OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint16x16)
2323 OPENCV_HAL_IMPL_LASX_EXTRACT(v_int16x16)
2324 OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint32x8)
2325 OPENCV_HAL_IMPL_LASX_EXTRACT(v_int32x8)
2326 OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint64x4)
2327 OPENCV_HAL_IMPL_LASX_EXTRACT(v_int64x4)
2328 OPENCV_HAL_IMPL_LASX_EXTRACT(v_float32x8)
2329 OPENCV_HAL_IMPL_LASX_EXTRACT(v_float64x4)
2330 
2331 template<int i>
2332 inline uchar v_extract_n(v_uint8x32 a)
2333 {
2334  return (uchar)_v256_extract_b<i>(a.val);
2335 }
2336 
2337 template<int i>
2338 inline schar v_extract_n(v_int8x32 a)
2339 {
2340  return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
2341 }
2342 
2343 template<int i>
2344 inline ushort v_extract_n(v_uint16x16 a)
2345 {
2346  return (ushort)_v256_extract_h<i>(a.val);
2347 }
2348 
2349 template<int i>
2350 inline short v_extract_n(v_int16x16 a)
2351 {
2352  return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
2353 }
2354 
2355 template<int i>
2356 inline uint v_extract_n(v_uint32x8 a)
2357 {
2358  return (uint)_v256_extract_w<i>(a.val);
2359 }
2360 
2361 template<int i>
2362 inline int v_extract_n(v_int32x8 a)
2363 {
2364  return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
2365 }
2366 
2367 template<int i>
2368 inline uint64 v_extract_n(v_uint64x4 a)
2369 {
2370  return (uint64)_v256_extract_d<i>(a.val);
2371 }
2372 
2373 template<int i>
2374 inline int64 v_extract_n(v_int64x4 v)
2375 {
2376  return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
2377 }
2378 
2379 template<int i>
2380 inline float v_extract_n(v_float32x8 v)
2381 {
2382  union { uint iv; float fv; } d;
2383  d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
2384  return d.fv;
2385 }
2386 
2387 template<int i>
2388 inline double v_extract_n(v_float64x4 v)
2389 {
2390  union { uint64 iv; double dv; } d;
2391  d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
2392  return d.dv;
2393 }
2394 
2395 template<int i>
2396 inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
2397 {
2398  static const __m256i perm = __lasx_xvreplgr2vr_w((char)i);
2399  return v_uint32x8(__lasx_xvperm_w(a.val, perm));
2400 }
2401 
2402 template<int i>
2403 inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
2404 { return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2405 
2406 template<int i>
2407 inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
2408 { return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2409 
2411 
2412 inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b)
2413 {
2414  __m256i t0 = __lasx_xvld(ptr, 0);
2415  __m256i t1 = __lasx_xvld(ptr, 32);
2416 
2417  __m256i p0 = __lasx_xvpickev_b(t1, t0);
2418  __m256i p1 = __lasx_xvpickod_b(t1, t0);
2419 
2420  a.val = __lasx_xvpermi_d(p0, 0xd8);
2421  b.val = __lasx_xvpermi_d(p1, 0xd8);
2422 }
2423 
2424 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
2425 {
2426  __m256i t0 = __lasx_xvld(ptr, 0);
2427  __m256i t1 = __lasx_xvld(ptr, 32);
2428 
2429  __m256i p0 = __lasx_xvpickev_h(t1, t0);
2430  __m256i p1 = __lasx_xvpickod_h(t1, t0);
2431 
2432  a.val = __lasx_xvpermi_d(p0, 0xd8);
2433  b.val = __lasx_xvpermi_d(p1, 0xd8);
2434 }
2435 
2436 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
2437 {
2438  __m256i t0 = __lasx_xvld(ptr, 0);
2439  __m256i t1 = __lasx_xvld(ptr, 32);
2440 
2441  __m256i p0 = __lasx_xvpickev_w(t1, t0);
2442  __m256i p1 = __lasx_xvpickod_w(t1, t0);
2443 
2444  a.val = __lasx_xvpermi_d(p0, 0xd8);
2445  b.val = __lasx_xvpermi_d(p1, 0xd8);
2446 }
2447 
2448 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
2449 {
2450  __m256i ab0 = __lasx_xvld(ptr, 0);
2451  __m256i ab1 = __lasx_xvld(ptr, 32);
2452 
2453  __m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02);
2454  __m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13);
2455  __m256i a0 = __lasx_xvilvl_d(ph, pl);
2456  __m256i b0 = __lasx_xvilvh_d(ph, pl);
2457  a = v_uint64x4(a0);
2458  b = v_uint64x4(b0);
2459 }
2460 
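// Usage sketch: the two-channel overloads split interleaved data into separate
// planes, e.g. (re, im) pairs of ushort (v_store assumed from earlier in this
// header):
//
//     v_uint16x16 re, im;
//     v_load_deinterleave(src, re, im);      // src holds 16 interleaved (re, im) pairs
//     v_store(dst_re, re);
//     v_store(dst_im, im);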
2461 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
2462 {
2463  __m256i bgr0 = __lasx_xvld(ptr, 0);
2464  __m256i bgr1 = __lasx_xvld(ptr, 32);
2465  __m256i bgr2 = __lasx_xvld(ptr, 64);
2466 
2467  __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
2468  __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
2469 
2470  const __m256i m0 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2471  0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2472  const __m256i m1 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2473  -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
2474 
2475  __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1);
2476  __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
2477  __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);
2478 
2479  const __m256i
2480  sh_b = _v256_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
2481  0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
2482  sh_g = _v256_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
2483  1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
2484  sh_r = _v256_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2485  2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2486  b0 = __lasx_xvshuf_b(b0, b0, sh_b);
2487  g0 = __lasx_xvshuf_b(g0, g0, sh_g);
2488  r0 = __lasx_xvshuf_b(r0, r0, sh_r);
2489 
2490  a = v_uint8x32(b0);
2491  b = v_uint8x32(g0);
2492  c = v_uint8x32(r0);
2493 }
2494 
2495 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
2496 {
2497  __m256i bgr0 = __lasx_xvld(ptr, 0);
2498  __m256i bgr1 = __lasx_xvld(ptr, 32);
2499  __m256i bgr2 = __lasx_xvld(ptr, 64);
2500 
2501  __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
2502  __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
2503 
2504  const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2505  0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2506  const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2507  -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2508  __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1);
2509  __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1);
2510  __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0);
2511  const __m256i sh_b = _v256_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2512  0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2513  const __m256i sh_g = _v256_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2514  2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2515  const __m256i sh_r = _v256_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2516  4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2517  b0 = __lasx_xvshuf_b(b0, b0, sh_b);
2518  g0 = __lasx_xvshuf_b(g0, g0, sh_g);
2519  r0 = __lasx_xvshuf_b(r0, r0, sh_r);
2520 
2521  a = v_uint16x16(b0);
2522  b = v_uint16x16(g0);
2523  c = v_uint16x16(r0);
2524 }
2525 
2526 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
2527 {
2528  __m256i bgr0 = __lasx_xvld(ptr, 0);
2529  __m256i bgr1 = __lasx_xvld(ptr, 32);
2530  __m256i bgr2 = __lasx_xvld(ptr, 64);
2531 
2532  __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
2533  __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
2534 
2535  __m256i m24 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);
2536  __m256i m92 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0);
2537  __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m24), bgr1, m92);
2538  __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m92), bgr1, m24);
2539  __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m24), s02_high, m92);
2540 
2541  b0 = __lasx_xvshuf4i_w(b0, 0x6c);
2542  g0 = __lasx_xvshuf4i_w(g0, 0xb1);
2543  r0 = __lasx_xvshuf4i_w(r0, 0xc6);
2544 
2545  a = v_uint32x8(b0);
2546  b = v_uint32x8(g0);
2547  c = v_uint32x8(r0);
2548 }
2549 
2550 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
2551 {
2552  __m256i bgr0 = __lasx_xvld(ptr, 0);
2553  __m256i bgr1 = __lasx_xvld(ptr, 32);
2554  __m256i bgr2 = __lasx_xvld(ptr, 64);
2555 
2556  __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128
2557  __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12);
2558  __m256i s20r = __lasx_xvpermi_d(__lasx_xvpermi_q(bgr2, bgr0, 0x12), 0x1b);
2559  __m256i b0 = __lasx_xvilvl_d(s20r, s01);
2560  __m256i g0 = _v256_alignr_b(s12, s01, 8);
2561  __m256i r0 = __lasx_xvilvh_d(s12, s20r);
2562 
2563  a = v_uint64x4(b0);
2564  b = v_uint64x4(g0);
2565  c = v_uint64x4(r0);
2566 }
2567 
2568 inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d)
2569 {
2570  __m256i t0 = __lasx_xvld(ptr, 0);
2571  __m256i t1 = __lasx_xvld(ptr, 32);
2572  __m256i t2 = __lasx_xvld(ptr, 64);
2573  __m256i t3 = __lasx_xvld(ptr, 96);
2574 
2575  const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
2576  __m256i ac_lo = __lasx_xvpickev_b(t1, t0);
2577  __m256i bd_lo = __lasx_xvpickod_b(t1, t0);
2578  __m256i ac_hi = __lasx_xvpickev_b(t3, t2);
2579  __m256i bd_hi = __lasx_xvpickod_b(t3, t2);
2580 
2581  __m256i a_pre = __lasx_xvpickev_b(ac_hi, ac_lo);
2582  __m256i c_pre = __lasx_xvpickod_b(ac_hi, ac_lo);
2583  __m256i b_pre = __lasx_xvpickev_b(bd_hi, bd_lo);
2584  __m256i d_pre = __lasx_xvpickod_b(bd_hi, bd_lo);
2585 
2586  a.val = __lasx_xvperm_w(a_pre, sh);
2587  b.val = __lasx_xvperm_w(b_pre, sh);
2588  c.val = __lasx_xvperm_w(c_pre, sh);
2589  d.val = __lasx_xvperm_w(d_pre, sh);
2590 }
2591 
2592 inline void v_load_deinterleave(const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d)
2593 {
2594  __m256i t0 = __lasx_xvld(ptr, 0);
2595  __m256i t1 = __lasx_xvld(ptr, 32);
2596  __m256i t2 = __lasx_xvld(ptr, 64);
2597  __m256i t3 = __lasx_xvld(ptr, 96);
2598 
2599  const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
2600  __m256i ac_lo = __lasx_xvpickev_h(t1, t0);
2601  __m256i bd_lo = __lasx_xvpickod_h(t1, t0);
2602  __m256i ac_hi = __lasx_xvpickev_h(t3, t2);
2603  __m256i bd_hi = __lasx_xvpickod_h(t3, t2);
2604 
2605  __m256i a_pre = __lasx_xvpickev_h(ac_hi, ac_lo);
2606  __m256i c_pre = __lasx_xvpickod_h(ac_hi, ac_lo);
2607  __m256i b_pre = __lasx_xvpickev_h(bd_hi, bd_lo);
2608  __m256i d_pre = __lasx_xvpickod_h(bd_hi, bd_lo);
2609 
2610  a.val = __lasx_xvperm_w(a_pre, sh);
2611  b.val = __lasx_xvperm_w(b_pre, sh);
2612  c.val = __lasx_xvperm_w(c_pre, sh);
2613  d.val = __lasx_xvperm_w(d_pre, sh);
2614 }
2615 
2616 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
2617 {
2618  __m256i p0 = __lasx_xvld(ptr, 0);
2619  __m256i p1 = __lasx_xvld(ptr, 32);
2620  __m256i p2 = __lasx_xvld(ptr, 64);
2621  __m256i p3 = __lasx_xvld(ptr, 96);
2622 
2623  __m256i p01l = __lasx_xvilvl_w(p1, p0);
2624  __m256i p01h = __lasx_xvilvh_w(p1, p0);
2625  __m256i p23l = __lasx_xvilvl_w(p3, p2);
2626  __m256i p23h = __lasx_xvilvh_w(p3, p2);
2627 
2628  __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
2629  __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
2630  __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
2631  __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);
2632 
2633  __m256i b0 = __lasx_xvilvl_w(plh, pll);
2634  __m256i g0 = __lasx_xvilvh_w(plh, pll);
2635  __m256i r0 = __lasx_xvilvl_w(phh, phl);
2636  __m256i a0 = __lasx_xvilvh_w(phh, phl);
2637 
2638  a = v_uint32x8(b0);
2639  b = v_uint32x8(g0);
2640  c = v_uint32x8(r0);
2641  d = v_uint32x8(a0);
2642 }
2643 
2644 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
2645 {
2646  __m256i bgra0 = __lasx_xvld(ptr, 0);
2647  __m256i bgra1 = __lasx_xvld(ptr, 32);
2648  __m256i bgra2 = __lasx_xvld(ptr, 64);
2649  __m256i bgra3 = __lasx_xvld(ptr, 96);
2650 
2651  __m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02);
2652  __m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13);
2653  __m256i l13 = __lasx_xvpermi_q(bgra1, bgra3, 0x02);
2654  __m256i h13 = __lasx_xvpermi_q(bgra1, bgra3, 0x13);
2655 
2656  __m256i b0 = __lasx_xvilvl_d(l13, l02);
2657  __m256i g0 = __lasx_xvilvh_d(l13, l02);
2658  __m256i r0 = __lasx_xvilvl_d(h13, h02);
2659  __m256i a0 = __lasx_xvilvh_d(h13, h02);
2660 
2661  a = v_uint64x4(b0);
2662  b = v_uint64x4(g0);
2663  c = v_uint64x4(r0);
2664  d = v_uint64x4(a0);
2665 }
2666 
2668 
2669 inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
2670  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2671 {
2672  __m256i xy_l = __lasx_xvilvl_b(y.val, x.val);
2673  __m256i xy_h = __lasx_xvilvh_b(y.val, x.val);
2674 
2675  __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2676  __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2677 
2678  __lasx_xvst(xy0, (__m256i*)ptr, 0);
2679  __lasx_xvst(xy1, (__m256i*)ptr, 32*1);
2680 }
2681 
2682 inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
2683  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2684 {
2685  __m256i xy_l = __lasx_xvilvl_h(y.val, x.val);
2686  __m256i xy_h = __lasx_xvilvh_h(y.val, x.val);
2687 
2688  __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2689  __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2690 
2691  __lasx_xvst(xy0, (__m256i*)ptr, 0);
2692  __lasx_xvst(xy1, (__m256i*)ptr, 16*2);
2693 }
2694 
2695 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
2696  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2697 {
2698  __m256i xy_l = __lasx_xvilvl_w(y.val, x.val);
2699  __m256i xy_h = __lasx_xvilvh_w(y.val, x.val);
2700 
2701  __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2702  __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2703 
2704  __lasx_xvst(xy0, (__m256i*)ptr, 0);
2705  __lasx_xvst(xy1, (__m256i*)ptr, 8*4);
2706 }
2707 
2708 inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
2709  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2710 {
2711  __m256i xy_l = __lasx_xvilvl_d(y.val, x.val);
2712  __m256i xy_h = __lasx_xvilvh_d(y.val, x.val);
2713 
2714  __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16);
2715  __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16);
2716 
2717  __lasx_xvst(xy0, (__m256i*)ptr, 0);
2718  __lasx_xvst(xy1, (__m256i*)ptr, 4*8);
2719 }
2720 
2721 inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
2722  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2723 {
2724  const __m256i sh_b = _v256_setr_b(
2725  0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
2726  0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2727  const __m256i sh_g = _v256_setr_b(
2728  5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
2729  5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2730  const __m256i sh_r = _v256_setr_b(
2731  10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
2732  10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2733 
2734  __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b);
2735  __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
2736  __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);
2737 
2738  const __m256i m0 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2739  0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2740  const __m256i m1 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2741  0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2742 
2743  __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1);
2744  __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
2745  __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);
2746 
2747  __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16);
2748  __m256i bgr1 = __lasx_xvpermi_q(p0, p2, 0 + 3*16);
2749  __m256i bgr2 = __lasx_xvpermi_q(p2, p1, 1 + 3*16);
2750 
2751  __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2752  __lasx_xvst(bgr1, (__m256i*)ptr, 32);
2753  __lasx_xvst(bgr2, (__m256i*)ptr, 64);
2754 }
2755 
2756 inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
2757  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2758 {
2759  const __m256i sh_b = _v256_setr_b(
2760  0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2761  0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2762  const __m256i sh_g = _v256_setr_b(
2763  10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2764  10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2765  const __m256i sh_r = _v256_setr_b(
2766  4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2767  4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2768 
2769  __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b);
2770  __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g);
2771  __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r);
2772 
2773  const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2774  0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2775  const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2776  -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2777 
2778  __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1);
2779  __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1);
2780  __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1);
2781 
2782  __m256i bgr0 = __lasx_xvpermi_q(p2, p0, 0 + 2*16);
2783  __m256i bgr2 = __lasx_xvpermi_q(p2, p0, 1 + 3*16);
2784 
2785  __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2786  __lasx_xvst(p1, (__m256i*)ptr, 16*2);
2787  __lasx_xvst(bgr2, (__m256i*)ptr, 32*2);
2788 }
2789 
2790 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
2791  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2792 {
2793  __m256i b0 = __lasx_xvshuf4i_w(a.val, 0x6c);
2794  __m256i g0 = __lasx_xvshuf4i_w(b.val, 0xb1);
2795  __m256i r0 = __lasx_xvshuf4i_w(c.val, 0xc6);
2796 
2797  __m256i bitmask_1 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0);
2798  __m256i bitmask_2 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0);
2799 
2800  __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, bitmask_1), r0, bitmask_2);
2801  __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, bitmask_1), b0, bitmask_2);
2802  __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, bitmask_1), g0, bitmask_2);
2803 
2804  __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16);
2805  __m256i bgr2 = __lasx_xvpermi_q(p1, p0, 1 + 3*16);
2806 
2807  __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2808  __lasx_xvst(p2, (__m256i*)ptr, 8*4);
2809  __lasx_xvst(bgr2, (__m256i*)ptr, 16*4);
2810 }
2811 
2812 inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
2813  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2814 {
2815  __m256i s01 = __lasx_xvilvl_d(b.val, a.val);
2816  __m256i s12 = __lasx_xvilvh_d(c.val, b.val);
2817  __m256i s20 = __lasx_xvpermi_w(a.val, c.val, 0xe4);
2818 
2819  __m256i bgr0 = __lasx_xvpermi_q(s20, s01, 0 + 2*16);
2820  __m256i bgr1 = __lasx_xvpermi_q(s01, s12, 0x30);
2821  __m256i bgr2 = __lasx_xvpermi_q(s12, s20, 1 + 3*16);
2822 
2823  __lasx_xvst(bgr0, (__m256i*)ptr, 0);
2824  __lasx_xvst(bgr1, (__m256i*)ptr, 4*8);
2825  __lasx_xvst(bgr2, (__m256i*)ptr, 8*8);
2826 }
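// Note on the __lasx_xvpermi_q immediates used throughout these interleave
// routines (an editor's reading of the xvpermi.q selector encoding, kept
// hedged): for __lasx_xvpermi_q(a, b, imm), selector values 0/1 address the
// low/high 128-bit half of the second operand b and 2/3 the low/high half of
// the first operand a; imm bits 1:0 choose the result's low half and bits 5:4
// its high half. Hence the recurring constants:
//
//   0 + 2*16  (0x20)  ->  { b.low , a.low  }
//   0 + 3*16  (0x30)  ->  { b.low , a.high }
//   1 + 3*16  (0x31)  ->  { b.high, a.high }
//
// which is how the per-lane shuffles are recombined into contiguous stores.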
2827 
2828 inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
2829  const v_uint8x32& c, const v_uint8x32& d,
2830  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2831 {
2832  __m256i bg0 = __lasx_xvilvl_b(b.val, a.val);
2833  __m256i bg1 = __lasx_xvilvh_b(b.val, a.val);
2834  __m256i ra0 = __lasx_xvilvl_b(d.val, c.val);
2835  __m256i ra1 = __lasx_xvilvh_b(d.val, c.val);
2836 
2837  __m256i bgra0_ = __lasx_xvilvl_h(ra0, bg0);
2838  __m256i bgra1_ = __lasx_xvilvh_h(ra0, bg0);
2839  __m256i bgra2_ = __lasx_xvilvl_h(ra1, bg1);
2840  __m256i bgra3_ = __lasx_xvilvh_h(ra1, bg1);
2841 
2842  __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
2843  __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
2844  __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
2845  __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);
2846 
2847  __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2848  __lasx_xvst(bgra1, (__m256i*)ptr, 32);
2849  __lasx_xvst(bgra2, (__m256i*)ptr, 64);
2850  __lasx_xvst(bgra3, (__m256i*)ptr, 96);
2851 }
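// Illustrative sketch (not part of the upstream header): the four-channel
// stores interleave b/g and r/a pairwise, widen the interleave, and then fix
// up the 128-bit lanes. A scalar reference of the resulting memory layout,
// assuming plain 32-element arrays for the four planes:

inline void v_store_interleave_bgra_scalar_sketch(uchar* dst, const uchar b[32],
                                                  const uchar g[32], const uchar r[32],
                                                  const uchar a[32])
{
    for (int i = 0; i < 32; ++i)  // one b,g,r,a quad per lane
    {
        dst[4*i + 0] = b[i];
        dst[4*i + 1] = g[i];
        dst[4*i + 2] = r[i];
        dst[4*i + 3] = a[i];
    }
}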
2852 
2853 inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b,
2854  const v_uint16x16& c, const v_uint16x16& d,
2855  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2856 {
2857  __m256i bg0 = __lasx_xvilvl_h(b.val, a.val);
2858  __m256i bg1 = __lasx_xvilvh_h(b.val, a.val);
2859  __m256i ra0 = __lasx_xvilvl_h(d.val, c.val);
2860  __m256i ra1 = __lasx_xvilvh_h(d.val, c.val);
2861 
2862  __m256i bgra0_ = __lasx_xvilvl_w(ra0, bg0);
2863  __m256i bgra1_ = __lasx_xvilvh_w(ra0, bg0);
2864  __m256i bgra2_ = __lasx_xvilvl_w(ra1, bg1);
2865  __m256i bgra3_ = __lasx_xvilvh_w(ra1, bg1);
2866 
2867  __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
2868  __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
2869  __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
2870  __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);
2871 
2872  __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2873  __lasx_xvst(bgra1, (__m256i*)ptr, 16*2);
2874  __lasx_xvst(bgra2, (__m256i*)ptr, 32*2);
2875  __lasx_xvst(bgra3, (__m256i*)ptr, 48*2);
2876 }
2877 
2878 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b,
2879  const v_uint32x8& c, const v_uint32x8& d,
2880  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2881 {
2882  __m256i bg0 = __lasx_xvilvl_w(b.val, a.val);
2883  __m256i bg1 = __lasx_xvilvh_w(b.val, a.val);
2884  __m256i ra0 = __lasx_xvilvl_w(d.val, c.val);
2885  __m256i ra1 = __lasx_xvilvh_w(d.val, c.val);
2886 
2887  __m256i bgra0_ = __lasx_xvilvl_d(ra0, bg0);
2888  __m256i bgra1_ = __lasx_xvilvh_d(ra0, bg0);
2889  __m256i bgra2_ = __lasx_xvilvl_d(ra1, bg1);
2890  __m256i bgra3_ = __lasx_xvilvh_d(ra1, bg1);
2891 
2892  __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16);
2893  __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16);
2894  __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16);
2895  __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16);
2896 
2897  __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2898  __lasx_xvst(bgra1, (__m256i*)ptr, 8*4);
2899  __lasx_xvst(bgra2, (__m256i*)ptr, 16*4);
2900  __lasx_xvst(bgra3, (__m256i*)ptr, 24*4);
2901 }
2902 
2903 inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b,
2904  const v_uint64x4& c, const v_uint64x4& d,
2905  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED )
2906 {
2907  __m256i bg0 = __lasx_xvilvl_d(b.val, a.val);
2908  __m256i bg1 = __lasx_xvilvh_d(b.val, a.val);
2909  __m256i ra0 = __lasx_xvilvl_d(d.val, c.val);
2910  __m256i ra1 = __lasx_xvilvh_d(d.val, c.val);
2911 
2912  __m256i bgra0 = __lasx_xvpermi_q(ra0, bg0, 0 + 2*16);
2913  __m256i bgra1 = __lasx_xvpermi_q(ra1, bg1, 0 + 2*16);
2914  __m256i bgra2 = __lasx_xvpermi_q(ra0, bg0, 1 + 3*16);
2915  __m256i bgra3 = __lasx_xvpermi_q(ra1, bg1, 1 + 3*16);
2916 
2917  __lasx_xvst(bgra0, (__m256i*)ptr, 0);
2918  __lasx_xvst(bgra1, (__m256i*)(ptr), 4*8);
2919  __lasx_xvst(bgra2, (__m256i*)(ptr), 8*8);
2920  __lasx_xvst(bgra3, (__m256i*)(ptr), 12*8);
2921 }
2922 
2923 
2924 #define OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2925 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2926 { \
2927  _Tpvec1 a1, b1; \
2928  v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2929  a0 = v_reinterpret_as_##suffix0(a1); \
2930  b0 = v_reinterpret_as_##suffix0(b1); \
2931 } \
2932 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2933 { \
2934  _Tpvec1 a1, b1, c1; \
2935  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2936  a0 = v_reinterpret_as_##suffix0(a1); \
2937  b0 = v_reinterpret_as_##suffix0(b1); \
2938  c0 = v_reinterpret_as_##suffix0(c1); \
2939 } \
2940 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2941 { \
2942  _Tpvec1 a1, b1, c1, d1; \
2943  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2944  a0 = v_reinterpret_as_##suffix0(a1); \
2945  b0 = v_reinterpret_as_##suffix0(b1); \
2946  c0 = v_reinterpret_as_##suffix0(c1); \
2947  d0 = v_reinterpret_as_##suffix0(d1); \
2948 } \
2949 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2950  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2951 { \
2952  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2953  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2954  v_store_interleave((_Tp1*)ptr, a1, b1/*, mode*/); \
2955 } \
2956 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
2957  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2958 { \
2959  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2960  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2961  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2962  v_store_interleave((_Tp1*)ptr, a1, b1, c1/*, mode*/); \
2963 } \
2964 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2965  const _Tpvec0& c0, const _Tpvec0& d0, \
2966  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2967 { \
2968  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2969  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2970  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2971  _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2972  v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1/*, mode*/); \
2973 }
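// Illustrative sketch (not part of the upstream header): every expansion of
// this macro just reinterprets the signed or floating-point vectors as their
// unsigned counterparts and forwards to the unsigned overloads above. For the
// v_int8x32 instantiation below, the generated two-channel store behaves like:

inline void v_store_interleave_s8_sketch(schar* ptr, const v_int8x32& a0, const v_int8x32& b0)
{
    v_uint8x32 a1 = v_reinterpret_as_u8(a0);  // bit-pattern reinterpretation, no value conversion
    v_uint8x32 b1 = v_reinterpret_as_u8(b0);
    v_store_interleave((uchar*)ptr, a1, b1);
}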
2974 
2975 OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
2976 OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
2977 OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
2978 OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
2979 OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
2980 OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
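// Illustrative usage (not part of the upstream header): with the
// instantiations above, deinterleave/interleave work uniformly across the
// wrapped element types. For example, swapping the two channels of 8
// interleaved float pairs (16 floats) in place:

inline void v_swap_float_pairs_sketch(float* data)
{
    v_float32x8 x, y;
    v_load_deinterleave(data, x, y);  // data[0],data[2],... -> x; data[1],data[3],... -> y
    v_store_interleave(data, y, x);   // write the 16 floats back with channels swapped
}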
2981 
2982 //
2983 // FP16
2984 //
2985 
2986 inline v_float32x8 v256_load_expand(const hfloat* ptr)
2987 {
2988 #if CV_FP16
2989  // 1: load 128 bits (8 halfs), 2: xvpermi_d places each 64-bit half of the data in the low half of a 128-bit lane, 3: xvfcvtl widens to float
2990  return v_float32x8(__lasx_xvfcvtl_s_h(__lasx_xvpermi_d(__lsx_vld((const __m128i*)ptr, 0), 0x10)));
2991 #else
2992  float CV_DECL_ALIGNED(32) buf[8];
2993  for (int i = 0; i < 8; i++)
2994  buf[i] = (float)ptr[i];
2995  return v256_load_aligned(buf);
2996 #endif
2997 }
2998 
2999 inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
3000 {
3001 #if CV_FP16
3002  __m256i ah = __lasx_xvfcvt_h_s(a.val, a.val);
3003  __lsx_vst(*((__m128i*)&ah), ptr, 0);  // store the low 128 bits (8 packed halfs)
3004 #else
3005  float CV_DECL_ALIGNED(32) buf[8];
3006  v_store_aligned(buf, a);
3007  for (int i = 0; i < 8; i++)
3008  ptr[i] = hfloat(buf[i]);
3009 #endif
3010 }
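// Illustrative usage (not part of the upstream header): v256_load_expand and
// v_pack_store form a half<->single round trip; when CV_FP16 is unavailable
// both fall back to the scalar loops in their #else branches. Assuming a
// buffer of at least 8 hfloat values:

inline void v_fp16_roundtrip_sketch(const hfloat* src, hfloat* dst)
{
    v_float32x8 v = v256_load_expand(src);  // widen 8 halfs to 8 floats
    // ... single-precision arithmetic would go here ...
    v_pack_store(dst, v);                   // narrow the 8 floats back to halfs
}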
3011 
3012 //
3013 // end of FP16
3014 //
3015 
3016 inline void v256_cleanup() {}
3017 
3018 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3019 
3021 
3022 } // cv::
3023 
3024 #endif // OPENCV_HAL_INTRIN_LASX_HPP