EstervQrCode 1.1.1
Library for qr code manipulation
intrin_lsx.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4 
5 #ifndef OPENCV_HAL_INTRIN_LSX_HPP
6 #define OPENCV_HAL_INTRIN_LSX_HPP
7 
8 #include <lsxintrin.h>
9 
10 #define CV_SIMD128 1
11 #define CV_SIMD128_64F 1
12 #define CV_SIMD128_FP16 0
13 
14 namespace cv
15 {
16 
18 
19 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
20 
22 
23 inline __m128i _v128_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
24  char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
25 {
26  return (__m128i)v16i8{ v0, v1, v2, v3, v4, v5, v6, v7,
27  v8, v9, v10, v11, v12, v13, v14, v15 };
28 }
29 
30 inline __m128i _v128_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
31  char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
32 {
33  return (__m128i)v16i8{ v15, v14, v13, v12, v11, v10, v9, v8,
34  v7, v6, v5, v4, v3, v2, v1, v0 };
35 }
36 
37 inline __m128i _v128_setr_h(short v0, short v1, short v2, short v3, short v4, short v5,
38  short v6, short v7)
39 {
40  return (__m128i)v8i16{ v0, v1, v2, v3, v4, v5, v6, v7 };
41 }
42 
43 inline __m128i _v128_setr_w(int v0, int v1, int v2, int v3)
44 {
45  return (__m128i)v4i32{ v0, v1, v2, v3 };
46 }
47 
48 inline __m128i _v128_set_w(int v0, int v1, int v2, int v3)
49 {
50  return (__m128i)v4i32{ v3, v2, v1, v0 };
51 }
52 
53 inline __m128i _v128_setall_w(int v0)
54 {
55  return __lsx_vreplgr2vr_w(v0);
56 }
57 
58 inline __m128i _v128_setr_d(int64 v0, int64 v1)
59 {
60  return (__m128i)v2i64{ v0, v1 };
61 }
62 
63 inline __m128i _v128_set_d(int64 v0, int64 v1)
64 {
65  return (__m128i)v2i64{ v1, v0 };
66 }
67 
68 inline __m128 _v128_setr_ps(float v0, float v1, float v2, float v3)
69 {
70  return (__m128)v4f32{ v0, v1, v2, v3 };
71 }
72 
73 inline __m128 _v128_setall_ps(float v0)
74 {
75  return (__m128)v4f32{ v0, v0, v0, v0 };
76 }
77 
78 inline __m128d _v128_setr_pd(double v0, double v1)
79 {
80  return (__m128d)v2f64{ v0, v1 };
81 }
82 
83 inline __m128d _v128_setall_pd(double v0)
84 {
85  return (__m128d)v2f64{ v0, v0 };
86 }
87 
88 inline __m128i _lsx_packus_h(const __m128i& a, const __m128i& b)
89 {
90  return __lsx_vssrarni_bu_h(b, a, 0);
91 }
92 
93 inline __m128i _lsx_packs_h(const __m128i& a, const __m128i& b)
94 {
95  return __lsx_vssrarni_b_h(b, a, 0);
96 }
97 
98 inline __m128i _lsx_packus_w(const __m128i& a, const __m128i& b)
99 {
100  return __lsx_vssrarni_hu_w(b, a, 0);
101 }
102 
104 
105 struct v_uint8x16
106 {
107  typedef uchar lane_type;
108  enum { nlanes = 16};
109 
110  v_uint8x16() {}
111  explicit v_uint8x16(__m128i v): val(v) {}
112  v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
113  uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
114  {
115  val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
116  }
117 
118  uchar get0() const
119  {
120  return (uchar)__lsx_vpickve2gr_bu(val, 0);
121  }
122 
123  __m128i val;
124 };
125 
126 struct v_int8x16
127 {
128  typedef schar lane_type;
129  enum { nlanes = 16 };
130 
131  v_int8x16() {}
132  explicit v_int8x16(__m128i v) : val(v) {}
133  v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
134  schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
135  {
136  val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
137  }
138 
139  schar get0() const
140  {
141  return (schar)__lsx_vpickve2gr_b(val, 0);
142  }
143 
144  __m128i val;
145 };
146 
147 struct v_uint16x8
148 {
149  typedef ushort lane_type;
150  enum { nlanes = 8 };
151 
152  v_uint16x8() {}
153  explicit v_uint16x8(__m128i v) : val(v) {}
154  v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
155  {
156  val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
157  }
158 
159  ushort get0() const
160  {
161  return (ushort)__lsx_vpickve2gr_hu(val, 0);
162  }
163 
164  __m128i val;
165 };
166 
167 struct v_int16x8
168 {
169  typedef short lane_type;
170  enum { nlanes = 8 };
171 
172  v_int16x8() {}
173  explicit v_int16x8(__m128i v) : val(v) {}
174  v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
175  {
176  val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
177  }
178 
179  short get0() const
180  {
181  return (short)__lsx_vpickve2gr_h(val, 0);
182  }
183 
184  __m128i val;
185 };
186 
187 struct v_uint32x4
188 {
189  typedef unsigned lane_type;
190  enum { nlanes = 4 };
191 
192  v_uint32x4() {}
193  explicit v_uint32x4(__m128i v) : val(v) {}
194  v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
195  {
196  val = _v128_setr_w(v0, v1, v2, v3);
197  }
198 
199  unsigned get0() const
200  {
201  return (unsigned)__lsx_vpickve2gr_wu(val, 0);
202  }
203 
204  __m128i val;
205 };
206 
207 struct v_int32x4
208 {
209  typedef int lane_type;
210  enum { nlanes = 4 };
211 
212  v_int32x4() {}
213  explicit v_int32x4(__m128i v) : val(v) {}
214  v_int32x4(int v0, int v1, int v2, int v3)
215  {
216  val = _v128_setr_w(v0, v1, v2, v3);
217  }
218 
219  int get0() const
220  {
221  return (int)__lsx_vpickve2gr_w(val, 0);
222  }
223 
224  __m128i val;
225 };
226 
227 struct v_float32x4
228 {
229  typedef float lane_type;
230  enum { nlanes = 4};
231 
232  v_float32x4() {}
233  explicit v_float32x4(__m128 v) : val(v) {}
234  explicit v_float32x4(__m128i v) { val = *((__m128*)&v); }
235  v_float32x4(float v0, float v1, float v2, float v3)
236  {
237  val = _v128_setr_ps(v0, v1, v2, v3);
238  }
239 
240  float get0() const
241  {
242  union { int iv; float fv; } d;
243  d.iv = __lsx_vpickve2gr_w(val, 0);
244  return d.fv;
245  }
246 
247  int get0toint() const
248  {
249  __m128i result = __lsx_vftintrz_w_s(val);
250  return (int)__lsx_vpickve2gr_w(result, 0);
251  }
252 
253  __m128 val;
254 };
255 
256 struct v_uint64x2
257 {
258  typedef uint64 lane_type;
259  enum { nlanes = 2};
260 
261  v_uint64x2() {}
262  explicit v_uint64x2(__m128i v) : val(v) {}
263  v_uint64x2(uint64 v0, uint64 v1)
264  {
265  val = _v128_setr_d(v0, v1);
266  }
267 
268  uint64 get0() const
269  {
270  return __lsx_vpickve2gr_du(val, 0);
271  }
272 
273  __m128i val;
274 };
275 
276 struct v_int64x2
277 {
278  typedef int64 lane_type;
279  enum { nlanes = 2};
280 
281  v_int64x2() {}
282  explicit v_int64x2(__m128i v) : val(v) {}
283  v_int64x2(int64 v0, int64 v1)
284  {
285  val = _v128_setr_d(v0, v1);
286  }
287 
288  uint64 get0() const
289  {
290  return __lsx_vpickve2gr_d(val, 0);
291  }
292 
293  __m128i val;
294 };
295 
296 struct v_float64x2
297 {
298  typedef double lane_type;
299  enum { nlanes = 2};
300 
301  v_float64x2() {}
302  explicit v_float64x2(__m128d v) : val(v) {}
303  explicit v_float64x2(__m128i v) { val = *((__m128d*)&v); }
304  v_float64x2(double v0, double v1)
305  {
306  val = _v128_setr_pd(v0, v1);
307  }
308 
309  double get0() const
310  {
311  union { int64 iv; double fv; } d;
312  d.iv = __lsx_vpickve2gr_d(val, 0);
313  return d.fv;
314  }
315 
316  int64 get0toint64() const
317  {
318  __m128i result = __lsx_vftintrz_l_d(val);
319  return (int64)__lsx_vpickve2gr_d(result, 0);
320  }
321 
322  __m128d val;
323 };
324 
326 
327 #define OPENCV_HAL_IMPL_LSX_LOADSTORE(_Tpvec, _Tp) \
328  inline _Tpvec v_load(const _Tp* ptr) \
329  { return _Tpvec(__lsx_vld(ptr, 0)); } \
330  inline _Tpvec v_load_aligned(const _Tp* ptr) \
331  { return _Tpvec(__lsx_vld(ptr, 0)); } \
332  inline _Tpvec v_load_low(const _Tp* ptr) \
333  { return _Tpvec(__lsx_vldrepl_d(ptr, 0)); } \
334  inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
335  { \
336  __m128i vl = __lsx_vldrepl_d(ptr0, 0); \
337  __m128i vh = __lsx_vldrepl_d(ptr1, 0); \
338  return _Tpvec(__lsx_vilvl_d(vh, vl)); \
339  } \
340  inline void v_store(_Tp* ptr, const _Tpvec& a) \
341  { __lsx_vst(a.val, ptr, 0); } \
342  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
343  { __lsx_vst(a.val, ptr, 0); } \
344  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
345  { __lsx_vst(a.val, ptr, 0); } \
346  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\
347  { \
348  if ( mode == hal::STORE_UNALIGNED) \
349  __lsx_vst(a.val, ptr, 0); \
350  else if ( mode == hal::STORE_ALIGNED_NOCACHE) \
351  __lsx_vst(a.val, ptr, 0); \
352  else \
353  __lsx_vst(a.val, ptr, 0); \
354  } \
355  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
356  { __lsx_vstelm_d(a.val, ptr, 0, 0); } \
357  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
358  { __lsx_vstelm_d(a.val, ptr, 0, 1); } \
359 
360 OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint8x16, uchar)
361 OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int8x16, schar)
362 OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint16x8, ushort)
363 OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int16x8, short)
364 OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint32x4, unsigned)
365 OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int32x4, int)
366 OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint64x2, uint64)
367 OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int64x2, int64)
368 
369 #define OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \
370  inline _Tpvec v_load(const _Tp* ptr) \
371  { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \
372  inline _Tpvec v_load_aligned(const _Tp* ptr) \
373  { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \
374  inline _Tpvec v_load_low(const _Tp* ptr) \
375  { return _Tpvec((halfreg)__lsx_vldrepl_d(ptr, 0)); } \
376  inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
377  { \
378  __m128i vl = __lsx_vldrepl_d(ptr0, 0); \
379  __m128i vh = __lsx_vldrepl_d(ptr1, 0); \
380  return _Tpvec((halfreg)__lsx_vilvl_d(vh, vl)); \
381  } \
382  inline void v_store(_Tp* ptr, const _Tpvec& a) \
383  { __lsx_vst((__m128i)a.val, ptr, 0); } \
384  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
385  { __lsx_vst((__m128i)a.val, ptr, 0); } \
386  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
387  { __lsx_vst((__m128i)a.val, ptr, 0); } \
388  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\
389  { \
390  if( mode == hal::STORE_UNALIGNED) \
391  __lsx_vst((__m128i)a.val, ptr, 0); \
392  else if( mode == hal::STORE_ALIGNED_NOCACHE) \
393  __lsx_vst((__m128i)a.val, ptr, 0); \
394  else \
395  __lsx_vst((__m128i)a.val, ptr, 0); \
396  } \
397  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
398  { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 0); } \
399  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
400  { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 1); } \
401 
402 OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float32x4, float, __m128)
403 OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float64x2, double, __m128d)
404 
405 inline __m128i _lsx_128_castps_si128(const __m128& v)
406 { return __m128i(v); }
407 
408 inline __m128i _lsx_128_castpd_si128(const __m128d& v)
409 { return __m128i(v); }
410 
411 #define OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
412  inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
413  { return _Tpvec(cast(a.val)); }
414 
415 #define OPENCV_HAL_IMPL_LSX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
416  inline _Tpvec v_setzero_##suffix() \
417  { return _Tpvec(__lsx_vldi(0)); } \
418  inline _Tpvec v_setall_##suffix(_Tp v) \
419  { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); } \
420  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, OPENCV_HAL_NOP) \
421  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, OPENCV_HAL_NOP) \
422  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, OPENCV_HAL_NOP) \
423  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, OPENCV_HAL_NOP) \
424  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, OPENCV_HAL_NOP) \
425  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, OPENCV_HAL_NOP) \
426  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, OPENCV_HAL_NOP) \
427  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, OPENCV_HAL_NOP) \
428  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float32x4, suffix, _lsx_128_castps_si128) \
429  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float64x2, suffix, _lsx_128_castpd_si128) \
430 
431 OPENCV_HAL_IMPL_LSX_INIT(v_uint8x16, uchar, u8, b, int)
432 OPENCV_HAL_IMPL_LSX_INIT(v_int8x16, schar, s8, b, int)
433 OPENCV_HAL_IMPL_LSX_INIT(v_uint16x8, ushort, u16, h, int)
434 OPENCV_HAL_IMPL_LSX_INIT(v_int16x8, short, s16, h, int)
435 OPENCV_HAL_IMPL_LSX_INIT(v_uint32x4, unsigned, u32, w, int)
436 OPENCV_HAL_IMPL_LSX_INIT(v_int32x4, int, s32, w, int)
437 OPENCV_HAL_IMPL_LSX_INIT(v_uint64x2, uint64, u64, d, long int)
438 OPENCV_HAL_IMPL_LSX_INIT(v_int64x2, int64, s64, d, long int)
439 
440 inline __m128 _lsx_128_castsi128_ps(const __m128i &v)
441 { return __m128(v); }
442 
443 inline __m128d _lsx_128_castsi128_pd(const __m128i &v)
444 { return __m128d(v); }
445 
446 #define OPENCV_HAL_IMPL_LSX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
447  inline _Tpvec v_setzero_##suffix() \
448  { return _Tpvec(__lsx_vldi(0)); } \
449  inline _Tpvec v_setall_##suffix(_Tp v) \
450  { return _Tpvec(_v128_setall_##zsuffix(v)); } \
451  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast) \
452  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, cast) \
453  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast) \
454  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, cast) \
455  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, cast) \
456  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, cast) \
457  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, cast) \
458  OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, cast) \
459 
460 OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float32x4, float, f32, ps, _lsx_128_castsi128_ps)
461 OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float64x2, double, f64, pd, _lsx_128_castsi128_pd)
462 
463 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a)
464 { return a; }
465 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a)
466 { return v_float32x4(_lsx_128_castps_si128(__m128(a.val))); }
467 
468 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a)
469 { return a; }
470 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a)
471 { return v_float64x2(_lsx_128_castpd_si128(__m128d(a.val))); }
472 
474 
475 // unpacks
476 #define OPENCV_HAL_IMPL_LSX_UNPACK(_Tpvec, suffix) \
477  inline _Tpvec v128_unpacklo(const _Tpvec& a, const _Tpvec& b) \
478  { return _Tpvec(__lsx_vilvl_##suffix(__m128i(b.val), __m128i(a.val))); } \
479  inline _Tpvec v128_unpackhi(const _Tpvec& a, const _Tpvec& b) \
480  { return _Tpvec(__lsx_vilvh_##suffix(__m128i(b.val), __m128i(a.val))); } \
481 
482 OPENCV_HAL_IMPL_LSX_UNPACK(v_uint8x16, b)
483 OPENCV_HAL_IMPL_LSX_UNPACK(v_int8x16, b)
484 OPENCV_HAL_IMPL_LSX_UNPACK(v_uint16x8, h)
485 OPENCV_HAL_IMPL_LSX_UNPACK(v_int16x8, h)
486 OPENCV_HAL_IMPL_LSX_UNPACK(v_uint32x4, w)
487 OPENCV_HAL_IMPL_LSX_UNPACK(v_int32x4, w)
488 OPENCV_HAL_IMPL_LSX_UNPACK(v_uint64x2, d)
489 OPENCV_HAL_IMPL_LSX_UNPACK(v_int64x2, d)
490 OPENCV_HAL_IMPL_LSX_UNPACK(v_float32x4, w)
491 OPENCV_HAL_IMPL_LSX_UNPACK(v_float64x2, d)
492 
493 //ZIP
494 #define OPENCV_HAL_IMPL_LSX_ZIP(_Tpvec) \
495  inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
496  { return (_Tpvec)__lsx_vilvl_d((__m128i)b.val, (__m128i)a.val); } \
497  inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
498  { return (_Tpvec)__lsx_vilvh_d((__m128i)b.val, (__m128i)a.val); } \
499  inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
500  _Tpvec& c, _Tpvec& d) \
501  { \
502  __m128i a1 = (__m128i)a.val, b1 = (__m128i)b.val; \
503  c = _Tpvec(__lsx_vilvl_d(b1, a1)); \
504  d = _Tpvec(__lsx_vilvh_d(b1, a1)); \
505  } \
506  inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
507  _Tpvec& ab0, _Tpvec& ab1) \
508  { \
509  ab0 = v128_unpacklo(a, b); \
510  ab1 = v128_unpackhi(a, b); \
511  }
512 
513 OPENCV_HAL_IMPL_LSX_ZIP(v_uint8x16)
514 OPENCV_HAL_IMPL_LSX_ZIP(v_int8x16)
515 OPENCV_HAL_IMPL_LSX_ZIP(v_uint16x8)
516 OPENCV_HAL_IMPL_LSX_ZIP(v_int16x8)
517 OPENCV_HAL_IMPL_LSX_ZIP(v_uint32x4)
518 OPENCV_HAL_IMPL_LSX_ZIP(v_int32x4)
519 OPENCV_HAL_IMPL_LSX_ZIP(v_uint64x2)
520 OPENCV_HAL_IMPL_LSX_ZIP(v_int64x2)
521 OPENCV_HAL_IMPL_LSX_ZIP(v_float32x4)
522 OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)
523 
524 
527 #define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \
528  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
529  { return _Tpvec(intrin(a.val, b.val)); } \
530  inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
531  { a.val = intrin(a.val, b.val); return a; }
532 
533 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
534 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
535 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b)
536 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b)
537 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
538 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
539 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h)
540 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h)
541 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
542 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
543 OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
544 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w)
545 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w)
546 OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w)
547 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
548 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
549 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d)
550 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d)
551 
552 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
553 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
554 OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
555 OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
556 OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
557 OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
558 OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
559 OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
560 
561 // saturating multiply 8-bit, 16-bit
562 inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
563 {
564  v_uint16x8 c, d;
565  v_mul_expand(a, b, c, d);
566  return v_pack(c, d);
567 }
568 inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
569 {
570  v_int16x8 c, d;
571  v_mul_expand(a, b, c, d);
572  return v_pack(c, d);
573 }
574 inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
575 {
576  __m128i a0 = a.val, b0 = b.val;
577  __m128i pev = __lsx_vmulwev_w_hu(a0, b0);
578  __m128i pod = __lsx_vmulwod_w_hu(a0, b0);
579  __m128i pl = __lsx_vilvl_w(pod, pev);
580  __m128i ph = __lsx_vilvh_w(pod, pev);
581  return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
582 }
583 inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
584 {
585  __m128i a0 = a.val, b0 = b.val;
586  __m128i pev = __lsx_vmulwev_w_h(a0, b0);
587  __m128i pod = __lsx_vmulwod_w_h(a0, b0);
588  __m128i pl = __lsx_vilvl_w(pod, pev);
589  __m128i ph = __lsx_vilvh_w(pod, pev);
590  return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
591 }
592 inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
593 { a = a * b; return a; }
594 inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
595 { a = a * b; return a; }
596 inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
597 { a = a * b; return a; }
598 inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
599 { a = a * b; return a; }
600 
603 #define OPENCV_HAL_IMPL_LSX_BIN_FUNC(func, _Tpvec, intrin) \
604  inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
605  { return _Tpvec(intrin(a.val, b.val)); } \
606 
607 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint8x16, __lsx_vadd_b)
608 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int8x16, __lsx_vadd_b)
609 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint16x8, __lsx_vadd_h)
610 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int16x8, __lsx_vadd_h)
611 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint8x16, __lsx_vsub_b)
612 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int8x16, __lsx_vsub_b)
613 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint16x8, __lsx_vsub_h)
614 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int16x8, __lsx_vsub_h)
615 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_uint16x8, __lsx_vmul_h)
616 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_int16x8, __lsx_vmul_h)
617 
618 inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
619 {
620  __m128i a0 = a.val, b0 = b.val;
621  __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
622  __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
623  return v_uint8x16(__lsx_vpackev_b(p1, p0));
624 }
625 
626 inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
627 {
628  return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
629 }
630 
631 // Multiply and expand
632 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
633  v_uint16x8& c, v_uint16x8& d)
634 {
635  __m128i a0 = a.val, b0 = b.val;
636  __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
637  __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
638  c.val = __lsx_vilvl_h(p1, p0);
639  d.val = __lsx_vilvh_h(p1, p0);
640 }
641 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
642  v_int16x8& c, v_int16x8& d)
643 {
644  __m128i a0 = a.val, b0 = b.val;
645  __m128i p0 = __lsx_vmulwev_h_b(a0, b0);
646  __m128i p1 = __lsx_vmulwod_h_b(a0, b0);
647  c.val = __lsx_vilvl_h(p1, p0);
648  d.val = __lsx_vilvh_h(p1, p0);
649 }
650 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
651  v_int32x4& c, v_int32x4& d)
652 {
653  __m128i a0 = a.val, b0 = b.val;
654  __m128i p0 = __lsx_vmulwev_w_h(a0, b0);
655  __m128i p1 = __lsx_vmulwod_w_h(a0, b0);
656  c.val = __lsx_vilvl_w(p1, p0);
657  d.val = __lsx_vilvh_w(p1, p0);
658 }
659 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
660  v_uint32x4& c, v_uint32x4& d)
661 {
662  __m128i a0 = a.val, b0 = b.val;
663  __m128i p0 = __lsx_vmulwev_w_hu(a0, b0);
664  __m128i p1 = __lsx_vmulwod_w_hu(a0, b0);
665  c.val = __lsx_vilvl_w(p1, p0);
666  d.val = __lsx_vilvh_w(p1, p0);
667 }
668 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
669  v_uint64x2& c, v_uint64x2& d)
670 {
671  __m128i a0 = a.val, b0 = b.val;
672  __m128i p0 = __lsx_vmulwev_d_wu(a0, b0);
673  __m128i p1 = __lsx_vmulwod_d_wu(a0, b0);
674  c.val = __lsx_vilvl_d(p1, p0);
675  d.val = __lsx_vilvh_d(p1, p0);
676 }
677 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
678 { return v_int16x8(__lsx_vmuh_h(a.val, b.val)); }
679 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
680 { return v_uint16x8(__lsx_vmuh_hu(a.val, b.val)); }
681 
683 #define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
684  inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
685  { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
686  inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
687  { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
688  inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
689  { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
690  inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
691  { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
692  template<int imm> \
693  inline _Tpuvec v_shl(const _Tpuvec& a) \
694  { return _Tpuvec(__lsx_vslli_##suffix(a.val, imm)); } \
695  template<int imm> \
696  inline _Tpsvec v_shl(const _Tpsvec& a) \
697  { return _Tpsvec(__lsx_vslli_##suffix(a.val, imm)); } \
698  template<int imm> \
699  inline _Tpuvec v_shr(const _Tpuvec& a) \
700  { return _Tpuvec(__lsx_vsrli_##suffix(a.val, imm)); } \
701  template<int imm> \
702  inline _Tpsvec v_shr(const _Tpsvec& a) \
703  { return _Tpsvec(__lsx_vsrai_##suffix(a.val, imm)); } \
704 
705 OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint16x8, v_int16x8, h, __lsx_vsra_h)
706 OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint32x4, v_int32x4, w, __lsx_vsra_w)
707 OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)
708 
709 
710 #define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \
711  OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \
712  OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \
713  OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \
714  inline _Tpvec operator ~(const _Tpvec& a) \
715  { return _Tpvec(__lsx_vnori_b(a.val, 0)); } \
716 
717 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v)
718 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int8x16, v)
719 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint16x8, v)
720 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int16x8, v)
721 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint32x4, v)
722 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int32x4, v)
723 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v)
724 OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
725 
726 #define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
727  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
728  { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \
729  inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
730  { __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \
731  a.val = cast(c); \
732  return a;}
733 
734 #define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \
735  OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \
736  OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \
737  OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \
738  inline _Tpvec operator ~ (const _Tpvec& a) \
739  { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \
740 
741 OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
742 OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float64x2, _lsx_128_castsi128_pd)
743 
744 
745 #define OPENCV_HAL_IMPL_LSX_SELECT(_Tpvec) \
746  inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
747  { return _Tpvec(__lsx_vbitsel_v(b.val, a.val, mask.val)); } \
748 
749 OPENCV_HAL_IMPL_LSX_SELECT(v_uint8x16)
750 OPENCV_HAL_IMPL_LSX_SELECT(v_int8x16)
751 OPENCV_HAL_IMPL_LSX_SELECT(v_uint16x8)
752 OPENCV_HAL_IMPL_LSX_SELECT(v_int16x8)
753 OPENCV_HAL_IMPL_LSX_SELECT(v_uint32x4)
754 OPENCV_HAL_IMPL_LSX_SELECT(v_int32x4)
755 
756 inline v_float32x4 v_select(const v_float32x4 &mask, const v_float32x4 &a, const v_float32x4 &b)
757 { return v_float32x4(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }
758 inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const v_float64x2 &b)
759 { return v_float64x2(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }
760 
762 #define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \
763  inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
764  { return ~( a == b ); } \
765  inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
766  { return b > a ; } \
767  inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
768  { return ~(a < b); } \
769  inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
770  { return b >= a; } \
771 
772 #define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
773  inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
774  { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \
775  inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
776  { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \
777  inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
778  { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \
779  inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
780  { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \
781  OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \
782  OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
783 
784 OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint8x16, v_int8x16, b, bu)
785 OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu)
786 OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu)
787 
788 #define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \
789  inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
790  { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \
791  inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
792  { return ~(a == b); }
793 
794 OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
795 OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
796 
797 #define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
798  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
799  { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \
800 
801 #define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \
802  OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \
803  OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \
804  OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \
805  OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix) \
806 
807 OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
808 OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)
809 
810 inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
811 { return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }
812 
813 inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
814 { return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }
815 
816 inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
817 { return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }
818 
819 inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
820 { return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }
821 
822 inline v_float32x4 v_not_nan(const v_float32x4& a)
823 { return v_float32x4(__lsx_vfcmp_cor_s(a.val, a.val)); }
824 
825 inline v_float64x2 v_not_nan(const v_float64x2& a)
826 { return v_float64x2(__lsx_vfcmp_cor_d(a.val, a.val)); }
827 
829 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint8x16, __lsx_vmin_bu)
830 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint8x16, __lsx_vmax_bu)
831 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int8x16, __lsx_vmin_b)
832 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int8x16, __lsx_vmax_b)
833 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint16x8, __lsx_vmin_hu)
834 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint16x8, __lsx_vmax_hu)
835 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int16x8, __lsx_vmin_h)
836 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int16x8, __lsx_vmax_h)
837 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint32x4, __lsx_vmin_wu)
838 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint32x4, __lsx_vmax_wu)
839 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int32x4, __lsx_vmin_w)
840 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int32x4, __lsx_vmax_w)
841 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float32x4, __lsx_vfmin_s)
842 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float32x4, __lsx_vfmax_s)
843 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float64x2, __lsx_vfmin_d)
844 OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float64x2, __lsx_vfmax_d)
845 
846 template <int imm,
847  bool is_invalid = ((imm < 0) || (imm > 16)),
848  bool is_first = (imm == 0),
849  bool is_half = (imm == 8),
850  bool is_second = (imm == 16),
851  bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
852 class v_lsx_palignr_u8_class;
853 
854 template <int imm>
855 class v_lsx_palignr_u8_class<imm, true, false, false, false, false>;
856 
857 template <int imm>
858 class v_lsx_palignr_u8_class<imm, false, true, false, false, false>
859 {
860 public:
861  inline __m128i operator()(const __m128i& a, const __m128i& b) const
862  {
863  CV_UNUSED(b);
864  return a;
865  }
866 };
867 
868 template <int imm>
869 class v_lsx_palignr_u8_class<imm, false, false, true, false, false>
870 {
871 public:
872  inline __m128i operator()(const __m128i& a, const __m128i& b) const
873  {
874  return __lsx_vshuf4i_d(a, b, 0x9);
875  }
876 };
877 
878 template <int imm>
879 class v_lsx_palignr_u8_class<imm, false, false, false, true, false>
880 {
881 public:
882  inline __m128i operator()(const __m128i& a, const __m128i& b) const
883  {
884  CV_UNUSED(a);
885  return b;
886  }
887 };
888 
889 template <int imm>
890 class v_lsx_palignr_u8_class<imm, false, false, false, false, true>
891 {
892 public:
893  inline __m128i operator()(const __m128i& a, const __m128i& b) const
894  {
895  enum { imm2 = (sizeof(__m128i) - imm) };
896  return __lsx_vor_v(__lsx_vbsrl_v(a, imm), __lsx_vbsll_v(b, imm2));
897  }
898 };
899 
900 template <int imm>
901 inline __m128i v_lsx_palignr_u8(const __m128i& a, const __m128i& b)
902 {
903  CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_lsx_palignr_u8");
904  return v_lsx_palignr_u8_class<imm>()(a, b);
905 }
907 #define OPENCV_HAL_IMPL_LSX_ROTATE_CAST(_Tpvec, cast) \
908  template<int imm> \
909  inline _Tpvec v_rotate_right(const _Tpvec &a) \
910  { \
911  enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; \
912  __m128i ret = __lsx_vbsrl_v((__m128i)a.val, imm2); \
913  return _Tpvec(cast(ret)); \
914  } \
915  template<int imm> \
916  inline _Tpvec v_rotate_left(const _Tpvec &a) \
917  { \
918  enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; \
919  __m128i ret = __lsx_vbsll_v((__m128i)a.val, imm2); \
920  return _Tpvec(cast(ret)); \
921  } \
922  template<int imm> \
923  inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
924  { \
925  enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; \
926  return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)a.val, (__m128i)b.val))); \
927  } \
928  template<int imm> \
929  inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
930  { \
931  enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type))}; \
932  return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)b.val, (__m128i)a.val))); \
933  }
934 
935 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint8x16, OPENCV_HAL_NOP) \
936 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int8x16, OPENCV_HAL_NOP) \
937 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint16x8, OPENCV_HAL_NOP) \
938 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int16x8, OPENCV_HAL_NOP) \
939 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint32x4, OPENCV_HAL_NOP) \
940 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int32x4, OPENCV_HAL_NOP) \
941 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint64x2, OPENCV_HAL_NOP) \
942 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int64x2, OPENCV_HAL_NOP) \
943 
944 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float32x4, _lsx_128_castsi128_ps)
945 OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float64x2, _lsx_128_castsi128_pd)
946 
947 
948 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
949 {
950  __m128i vec = __lsx_vshuf4i_b(a.val, 0x1B);
951  return v_uint8x16(__lsx_vshuf4i_w(vec, 0x1B));
952 }
953 
954 inline v_int8x16 v_reverse(const v_int8x16 &a)
955 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
956 
957 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
958 {
959  __m128i vec = __lsx_vshuf4i_h(a.val, 0x1B);
960  return v_uint16x8(__lsx_vshuf4i_w(vec, 0x4E));
961 }
962 
963 inline v_int16x8 v_reverse(const v_int16x8 &a)
964 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
965 
966 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
967 { return v_uint32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }
968 
969 inline v_int32x4 v_reverse(const v_int32x4 &a)
970 { return v_int32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }
971 
972 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
973 { return v_uint64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }
974 
975 inline v_int64x2 v_reverse(const v_int64x2 &a)
976 { return v_int64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }
977 
978 inline v_float32x4 v_reverse(const v_float32x4 &a)
979 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
980 
981 inline v_float64x2 v_reverse(const v_float64x2 &a)
982 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
983 
985 
987 // this function is return a[0]+a[1]+...+a[31]
988 inline unsigned v_reduce_sum(const v_uint8x16& a)
989 {
990  __m128i t1 = __lsx_vhaddw_hu_bu(a.val, a.val);
991  __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
992  __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
993  __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
994  return (unsigned)__lsx_vpickve2gr_w(t4, 0);
995 }
996 
997 inline int v_reduce_sum(const v_int8x16 &a)
998 {
999  __m128i t1 = __lsx_vhaddw_h_b(a.val, a.val);
1000  __m128i t2 = __lsx_vhaddw_w_h(t1, t1);
1001  __m128i t3 = __lsx_vhaddw_d_w(t2, t2);
1002  __m128i t4 = __lsx_vhaddw_q_d(t3, t3);
1003  return (int)__lsx_vpickve2gr_w(t4, 0);
1004 }
1005 
1006 #define OPENCV_HAL_IMPL_LSX_REDUCE_16(_Tpvec, sctype, func, intrin) \
1007  inline sctype v_reduce_##func(const _Tpvec& a) \
1008  { \
1009  __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \
1010  val = intrin(val, __lsx_vbsrl_v(val, 4)); \
1011  val = intrin(val, __lsx_vbsrl_v(val, 2)); \
1012  val = intrin(val, __lsx_vbsrl_v(val, 1)); \
1013  return (sctype)__lsx_vpickve2gr_b(val, 0); \
1014  }
1015 
1016 OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, min, __lsx_vmin_bu)
1017 OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, max, __lsx_vmax_bu)
1018 OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, min, __lsx_vmin_b)
1019 OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, max, __lsx_vmax_b)
1020 
1021 #define OPENCV_HAL_IMPL_LSX_REDUCE_8(_Tpvec, sctype, func, intrin) \
1022  inline sctype v_reduce_##func(const _Tpvec &a) \
1023  { \
1024  __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \
1025  val = intrin(val, __lsx_vbsrl_v(val, 4)); \
1026  val = intrin(val, __lsx_vbsrl_v(val, 2)); \
1027  return (sctype)__lsx_vpickve2gr_h(val, 0); \
1028  }
1029 
1030 OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, min, __lsx_vmin_hu)
1031 OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, max, __lsx_vmax_hu)
1032 OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, min, __lsx_vmin_h)
1033 OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, max, __lsx_vmax_h)
1034 
1035 #define OPENCV_HAL_IMPL_LSX_REDUCE_4(_Tpvec, sctype, func, intrin) \
1036  inline sctype v_reduce_##func(const _Tpvec &a) \
1037  { \
1038  __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \
1039  val = intrin(val, __lsx_vbsrl_v(val, 4)); \
1040  return (sctype)__lsx_vpickve2gr_w(val, 0); \
1041  }
1042 
1043 OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, min, __lsx_vmin_wu)
1044 OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, max, __lsx_vmax_wu)
1045 OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, min, __lsx_vmin_w)
1046 OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, max, __lsx_vmax_w)
1047 
1048 #define OPENCV_HAL_IMPL_LSX_REDUCE_FLT(func, intrin) \
1049  inline float v_reduce_##func(const v_float32x4 &a) \
1050  { \
1051  __m128 val = a.val; \
1052  val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 8)); \
1053  val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 4)); \
1054  float *fval = (float*)&val; \
1055  return fval[0]; \
1056  }
1057 
1058 OPENCV_HAL_IMPL_LSX_REDUCE_FLT(min, __lsx_vfmin_s)
1059 OPENCV_HAL_IMPL_LSX_REDUCE_FLT(max, __lsx_vfmax_s)
1060 
1061 inline int v_reduce_sum(const v_int32x4 &a)
1062 {
1063  __m128i t1 = __lsx_vhaddw_d_w(a.val, a.val);
1064  __m128i t2 = __lsx_vhaddw_q_d(t1, t1);
1065  return (int)__lsx_vpickve2gr_w(t2, 0);
1066 }
1067 
1068 inline unsigned v_reduce_sum(const v_uint32x4 &a)
1069 {
1070  __m128i t1 = __lsx_vhaddw_du_wu(a.val, a.val);
1071  __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
1072  return (int)__lsx_vpickve2gr_w(t2, 0);
1073 }
1074 
1075 inline int v_reduce_sum(const v_int16x8 &a)
1076 {
1077  __m128i t1 = __lsx_vhaddw_w_h(a.val, a.val);
1078  __m128i t2 = __lsx_vhaddw_d_w(t1, t1);
1079  __m128i t3 = __lsx_vhaddw_q_d(t2, t2);
1080  return (int)__lsx_vpickve2gr_w(t3, 0);
1081 }
1082 
1083 inline unsigned v_reduce_sum(const v_uint16x8 &a)
1084 {
1085  __m128i t1 = __lsx_vhaddw_wu_hu(a.val, a.val);
1086  __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
1087  __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
1088  return (int)__lsx_vpickve2gr_w(t3, 0);
1089 }
1090 
1091 inline float v_reduce_sum(const v_float32x4 &a)
1092 {
1093  __m128i val = (__m128i)a.val;
1094  val = __lsx_vbsrl_v(val, 8);
1095  __m128 result = __lsx_vfadd_s(a.val, (__m128)val);
1096  float *pa = (float*)&result;
1097  return (float)(pa[0] + pa[1]);
1098 }
1099 
1100 inline uint64 v_reduce_sum(const v_uint64x2 &a)
1101 {
1102  __m128i t0 = __lsx_vhaddw_qu_du(a.val, a.val);
1103  return (uint64)__lsx_vpickve2gr_du(t0, 0);
1104 }
1105 
1106 inline int64 v_reduce_sum(const v_int64x2 &a)
1107 {
1108  __m128i t0 = __lsx_vhaddw_q_d(a.val, a.val);
1109  return (int64)__lsx_vpickve2gr_d(t0, 0);
1110 }
1111 
1112 inline double v_reduce_sum(const v_float64x2 &a)
1113 {
1114  double *pa = (double*)&a;
1115  return pa[0] + pa[1];
1116 }
1117 
1118 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1119  const v_float32x4& c, const v_float32x4& d)
1120 {
1121  __m128i a0 = (__m128i)a.val;
1122  __m128i b0 = (__m128i)b.val;
1123  __m128i c0 = (__m128i)c.val;
1124  __m128i d0 = (__m128i)d.val;
1125  __m128i ac_l = __lsx_vilvl_w(c0, a0);
1126  __m128i ac_h = __lsx_vilvh_w(c0, a0);
1127  __m128i bd_l = __lsx_vilvl_w(d0, b0);
1128  __m128i bd_h = __lsx_vilvh_w(d0, b0);
1129  __m128 ac = __lsx_vfadd_s((__m128)ac_l, (__m128)ac_h);
1130  __m128 bd = __lsx_vfadd_s((__m128)bd_l, (__m128)bd_h);
1131  return v_float32x4(__lsx_vfadd_s((__m128)__lsx_vilvl_w((__m128i)bd, (__m128i)ac),
1132  (__m128)__lsx_vilvh_w((__m128i)bd, (__m128i)ac)));
1133 }
1134 
1135 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1136 {
1137  __m128i t0 = __lsx_vabsd_b(a.val, b.val);
1138  __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
1139  __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
1140  __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
1141  __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
1142  return (unsigned)__lsx_vpickve2gr_w(t4, 0);
1143 }
1144 
1145 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1146 {
1147  __m128i t0 = __lsx_vabsd_bu(a.val, b.val);
1148  __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
1149  __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
1150  __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
1151  __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
1152  return (unsigned)__lsx_vpickve2gr_w(t4, 0);
1153 }
1154 
1155 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1156 {
1157  __m128i t0 = __lsx_vabsd_hu(a.val, b.val);
1158  __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
1159  __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
1160  __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
1161  return (unsigned)__lsx_vpickve2gr_w(t3, 0);
1162 }
1163 
1164 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1165 {
1166  __m128i t0 = __lsx_vabsd_h(a.val, b.val);
1167  __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
1168  __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
1169  __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
1170  return (unsigned)__lsx_vpickve2gr_w(t3, 0);
1171 }
1172 
1173 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1174 {
1175  __m128i t0 = __lsx_vabsd_wu(a.val, b.val);
1176  __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
1177  __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
1178  return (unsigned)__lsx_vpickve2gr_w(t2, 0);
1179 }
1180 
1181 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1182 {
1183  __m128i t0 = __lsx_vabsd_w(a.val, b.val);
1184  __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
1185  __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
1186  return (unsigned)__lsx_vpickve2gr_w(t2, 0);
1187 }
1188 
1189 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1190 {
1191  v_float32x4 a_b = a - b;
1192  return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff)));
1193 }
1194 
1196 #define OPENCV_HAL_IMPL_LSX_POPCOUNT(_Tpvec, _Tp, suffix) \
1197 inline _Tpvec v_popcount(const _Tp& a) \
1198 { return _Tpvec(__lsx_vpcnt_##suffix(a.val)); }
1199 
1200 OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16, v_uint8x16, b);
1201 OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16, v_int8x16, b);
1202 OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8, v_uint16x8, h);
1203 OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8, v_int16x8, h);
1204 OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4, v_uint32x4, w);
1205 OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4, v_int32x4, w);
1206 OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2, v_uint64x2, d);
1207 OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2, v_int64x2, d);
1208 
1210 #define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
1211 inline tt reinterpret_int(ft x) { union {ft l; tt i;} v; v.l = x; return v.i; }
1212 OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
1213 OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
1214 OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
1215 OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
1216 OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
1217 OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
1218 OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
1219 OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
1220 OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
1221 OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
1222 
1223 inline int v_signmask(const v_int8x16& a)
1224 {
1225  __m128i result = __lsx_vmskltz_b(a.val);
1226  return __lsx_vpickve2gr_w(result, 0);
1227 }
1228 inline int v_signmask(const v_uint8x16& a)
1229 { return v_signmask(v_reinterpret_as_s8(a)) ;}
1230 
1231 inline int v_signmask(const v_int16x8 &a)
1232 {
1233  __m128i result = __lsx_vmskltz_h(a.val);
1234  return __lsx_vpickve2gr_w(result, 0);
1235 }
1236 inline int v_signmask(const v_uint16x8 &a)
1237 { return v_signmask(v_reinterpret_as_s16(a)); }
1238 
1239 inline int v_signmask(const v_uint32x4& a)
1240 {
1241  __m128i result = __lsx_vmskltz_w(a.val);
1242  return __lsx_vpickve2gr_w(result, 0);
1243 }
1244 inline int v_signmask(const v_int32x4& a)
1245 { return v_signmask(v_reinterpret_as_u32(a)); }
1246 
1247 inline int v_signmask(const v_uint64x2& a)
1248 {
1249  __m128i result = __lsx_vmskltz_d(a.val);
1250  return __lsx_vpickve2gr_w(result, 0);
1251 }
1252 inline int v_signmask(const v_int64x2& a)
1253 { return v_signmask(v_reinterpret_as_u64(a)); }
1254 
1255 inline int v_signmask(const v_float32x4& a)
1256 { return v_signmask(*(v_int32x4*)(&a)); }
1257 
1258 inline int v_signmask(const v_float64x2& a)
1259 { return v_signmask(*(v_int64x2*)(&a)); }
1260 
1261 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1262 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1263 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1264 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1265 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1266 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1267 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1268 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1269 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1270 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1271 
1273 #define OPENCV_HAL_IMPL_LSX_CHECK(_Tpvec, allmask) \
1274  inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
1275  inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
1276 OPENCV_HAL_IMPL_LSX_CHECK(v_uint8x16, 65535)
1277 OPENCV_HAL_IMPL_LSX_CHECK(v_int8x16, 65535)
1278 OPENCV_HAL_IMPL_LSX_CHECK(v_uint16x8, 255);
1279 OPENCV_HAL_IMPL_LSX_CHECK(v_int16x8, 255);
1280 OPENCV_HAL_IMPL_LSX_CHECK(v_uint32x4, 15)
1281 OPENCV_HAL_IMPL_LSX_CHECK(v_int32x4, 15)
1282 OPENCV_HAL_IMPL_LSX_CHECK(v_uint64x2, 3)
1283 OPENCV_HAL_IMPL_LSX_CHECK(v_int64x2, 3)
1284 OPENCV_HAL_IMPL_LSX_CHECK(v_float32x4, 15)
1285 OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)
1286 
1288 
1290 #define OPENCV_HAL_IMPL_LSX_MULADD(_Tpvec, suffix) \
1291  inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1292  { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \
1293  inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec &b, const _Tpvec& c) \
1294  { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \
1295  inline _Tpvec v_sqrt(const _Tpvec& x) \
1296  { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \
1297  inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1298  { return v_fma(a, a, b * b); } \
1299  inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1300  { return v_sqrt(v_fma(a, a, b * b)); }
1301 
1302 OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
1303 OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)
1304 
1305 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1306 { return v_int32x4(__lsx_vmadd_w(c.val, a.val, b.val)); }
1307 
1308 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1309 { return v_fma(a, b, c); }
1310 
1311 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1312 {
1313  return v_float32x4(__lsx_vfrsqrt_s(x.val));
1314 }
1315 
1316 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1317 {
1318  return v_float64x2(__lsx_vfrsqrt_d(x.val));
1319 }
1320 
1322 #define OPENCV_HAL_IMPL_LSX_ABS(_Tpvec, suffix) \
1323  inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
1324  { return v_u##_Tpvec(__lsx_vabsd_##suffix(x.val, __lsx_vldi(0))); }
1325 
1326 OPENCV_HAL_IMPL_LSX_ABS(int8x16, b)
1327 OPENCV_HAL_IMPL_LSX_ABS(int16x8, h)
1328 OPENCV_HAL_IMPL_LSX_ABS(int32x4, w)
1329 
1330 inline v_float32x4 v_abs(const v_float32x4& x)
1331 { return v_float32x4(*((__m128i*)&x) & __lsx_vreplgr2vr_w(0x7fffffff)); }
1332 inline v_float64x2 v_abs(const v_float64x2& x)
1333 { return v_float64x2(*((__m128i*)&x) & __lsx_vreplgr2vr_d(0x7fffffffffffffff)); }
1334 
1337 inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
1338 { return (v_uint8x16)__lsx_vabsd_bu(a.val, b.val); }
1339 inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
1340 { return (v_uint16x8)__lsx_vabsd_hu(a.val, b.val); }
1341 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
1342 { return (v_uint32x4)__lsx_vabsd_wu(a.val, b.val); }
1343 
1344 inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1345 { return (v_uint8x16)__lsx_vabsd_b(a.val, b.val); }
1346 inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1347 { return (v_uint16x8)__lsx_vabsd_h(a.val, b.val); }
1348 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1349 { return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }
1350 
1351 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
1352 { return v_abs(a - b); }
1353 
1354 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
1355 { return v_abs(a - b); }
1356 
1358 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1359 {
1360  v_int8x16 d = a - b;
1361  v_int8x16 m = a < b;
1362  return (d ^ m) - m;
1363 }
1364 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1365 { return v_max(a, b) - v_min(a, b); }
1366 
1368 
1370 inline v_int32x4 v_round(const v_float32x4& a)
1371 { return v_int32x4(__lsx_vftint_w_s(a.val)); }
1372 
1373 inline v_int32x4 v_round(const v_float64x2& a)
1374 { return v_int32x4(__lsx_vftint_w_d(a.val, a.val)); }
1375 
1376 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1377 { return v_int32x4(__lsx_vftint_w_d(b.val, a.val)); }
1378 
1379 inline v_int32x4 v_trunc(const v_float32x4& a)
1380 { return v_int32x4(__lsx_vftintrz_w_s(a.val)); }
1381 
1382 inline v_int32x4 v_trunc(const v_float64x2& a)
1383 { return v_int32x4(__lsx_vftintrz_w_d(a.val, a.val)); }
1384 
1385 inline v_int32x4 v_floor(const v_float32x4& a)
1386 { return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrm_s(a.val)))); }
1387 
1388 inline v_int32x4 v_floor(const v_float64x2& a)
1389 { return v_trunc(v_float64x2(__lsx_vfrintrm_d(a.val))); }
1390 
1391 inline v_int32x4 v_ceil(const v_float32x4& a)
1392 { return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrp_s(a.val)))); }
1393 
1394 inline v_int32x4 v_ceil(const v_float64x2& a)
1395 { return v_trunc(v_float64x2(__lsx_vfrintrp_d(a.val))); }
1396 
1398 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1399 { return v_float32x4(__lsx_vffint_s_w(a.val)); }
1400 
1401 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1402 { return v_float32x4(__lsx_vfcvt_s_d(a.val, a.val)); }
1403 
1404 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1405 { return v_float32x4(__lsx_vfcvt_s_d(b.val, a.val)); }
1406 
1407 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1408 { return v_float64x2(__lsx_vffintl_d_w(a.val)); }
1409 
1410 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1411 { return v_float64x2(__lsx_vffinth_d_w(a.val)); }
1412 
1413 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1414 { return v_float64x2(__lsx_vfcvtl_d_s(a.val)); }
1415 
1416 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1417 { return v_float64x2(__lsx_vfcvth_d_s(a.val)); }
1418 
1419 inline v_float64x2 v_cvt_f64(const v_int64x2& v)
1420 { return v_float64x2(__lsx_vffint_d_l(v.val)); }
1421 
1422 
1424 inline v_int8x16 v_lut(const schar* tab, const int* idx)
1425 {
1426  return v_int8x16(_v128_setr_b(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1427  tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]], tab[idx[8]],
1428  tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]],
1429  tab[idx[14]], tab[idx[15]]));
1430 }
1431 
1432 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1433 {
1434  return v_int8x16(_v128_setr_h(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]),
1435  *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]), *(const short*)(tab + idx[4]),
1436  *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
1437 }
1438 
1439 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1440 {
1441  return v_int8x16(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1442  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
1443 }
1444 
1445 inline v_uint8x16 v_lut(const uchar* tab, const int* idx)
1446 { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
1447 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx)
1448 { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
1449 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx)
1450 { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
1451 
1452 inline v_int16x8 v_lut(const short* tab, const int* idx)
1453 {
1454  return v_int16x8(_v128_setr_h(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1455  tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1456 }
1457 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1458 {
1459  return v_int16x8(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1460  *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
1461 }
1462 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1463 {
1464  return v_int16x8(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
1465 }
1466 
1467 inline v_uint16x8 v_lut(const ushort* tab, const int* idx)
1468 { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
1469 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx)
1470 { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
1471 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx)
1472 { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
1473 
1474 inline v_int32x4 v_lut(const int* tab, const int* idx)
1475 {
1476  return v_int32x4(_v128_setr_w(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
1477 }
1478 inline v_int32x4 v_lut_pairs(const int *tab, const int* idx)
1479 {
1480  return v_int32x4(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
1481 }
1482 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1483 {
1484  return v_int32x4(__lsx_vld(tab + idx[0], 0));
1485 }
1486 
1487 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
1488 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
1489 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
1490 
1491 inline v_int64x2 v_lut(const int64_t* tab, const int *idx)
1492 {
1493  return v_int64x2(_v128_setr_d(tab[idx[0]], tab[idx[1]]));
1494 }
1495 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1496 {
1497  return v_int64x2(__lsx_vld(tab + idx[0], 0));
1498 }
1499 
1500 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1501 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1502 
1503 inline v_float32x4 v_lut(const float* tab, const int* idx)
1504 {
1505  return v_float32x4(_v128_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
1506 }
1507 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1508 {
1509  return v_float32x4((__m128)_v128_setr_pd(*(const double*)(tab + idx[0]), *(const double*)(tab + idx[1])));
1510 }
1511 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1512 {
1513  return v_float32x4((__m128)__lsx_vld(tab + idx[0], 0));
1514 }
1515 
1516 inline v_float64x2 v_lut(const double* tab, const int* idx)
1517 {
1518  return v_float64x2(_v128_setr_pd(tab[idx[0]], tab[idx[1]]));
1519 }
1520 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1521 {
1522  return v_float64x2((__m128d)__lsx_vld(tab + idx[0], 0));
1523 }
1524 
1525 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1526 {
1527  int *idx = (int*)&idxvec.val;
1528  return v_lut(tab, idx);
1529 }
1530 
1531 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1532 {
1533  return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1534 }
1535 
1536 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1537 {
1538  const int *idx = (const int*)&idxvec.val;
1539  return v_lut(tab, idx);
1540 }
1541 
1542 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1543 {
1544  const int *idx = (const int*)&idxvec.val;
1545  return v_lut(tab, idx);
1546 }
1547 
1548 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1549 {
1550  const int *idx = (const int*)&idxvec.val;
1551  __m128i xy0 = __lsx_vld(tab + idx[0], 0);
1552  __m128i xy1 = __lsx_vld(tab + idx[1], 0);
1553  __m128i xy2 = __lsx_vld(tab + idx[2], 0);
1554  __m128i xy3 = __lsx_vld(tab + idx[3], 0);
1555  __m128i xy01 = __lsx_vilvl_d(xy1, xy0);
1556  __m128i xy23 = __lsx_vilvl_d(xy3, xy2);
1557  __m128i xxyy02 = __lsx_vilvl_w(xy23, xy01);
1558  __m128i xxyy13 = __lsx_vilvh_w(xy23, xy01);
1559  x = v_float32x4((__m128)__lsx_vilvl_w(xxyy13, xxyy02));
1560  y = v_float32x4((__m128)__lsx_vilvh_w(xxyy13, xxyy02));
1561 }
1562 
1563 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1564 {
1565  const int* idx = (const int*)&idxvec.val;
1566  __m128i xy0 = __lsx_vld(tab + idx[0], 0);
1567  __m128i xy1 = __lsx_vld(tab + idx[1], 0);
1568  x = v_float64x2((__m128d)__lsx_vilvl_d(xy1, xy0));
1569  y = v_float64x2((__m128d)__lsx_vilvh_d(xy1, xy0));
1570 }
1571 
1572 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
1573 {
1574  return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
1575  _v128_setr_d(0x0705060403010200, 0x0f0d0e0c0b090a08)));
1576 }
1577 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
1578 { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1579 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
1580 {
1581  return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
1582  _v128_setr_d(0x0703060205010400, 0x0f0b0e0a0d090c08)));
1583 }
1584 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
1585 { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1586 
1587 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
1588 {
1589  return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
1590  _v128_setr_d(0x0706030205040100, 0x0f0e0b0a0d0c0908)));
1591 }
1592 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
1593 { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1594 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
1595 {
1596  return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
1597  _v128_setr_d(0x0b0a030209080100, 0x0f0e07060d0c0504)));
1598 }
1599 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
1600 { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1601 
1602 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
1603 {
1604  return v_int32x4(__lsx_vshuf4i_w(vec.val, 0xd8));
1605 }
1606 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
1607 { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1608 
1609 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
1610 { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1611 
1612 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
1613 {
1614  __m128i zero = __lsx_vldi(0);
1615  return v_int8x16(__lsx_vshuf_b(zero, vec.val,
1616  _v128_set_d(0x1211100f0e0d0c0a, 0x0908060504020100)));
1617 }
1618 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
1619 { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1620 
1621 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
1622 {
1623  __m128i zero = __lsx_vldi(0);
1624  return v_int16x8(__lsx_vshuf_b(zero, vec.val,
1625  _v128_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100)));
1626 }
1627 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
1628 { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1629 
1630 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
1631 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
1632 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
1633 
1635 
1637 
1638 // 16 >> 32
1639 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
1640 {
1641  __m128i x = a.val, y = b.val;
1642  return v_int32x4(__lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y));
1643 }
1644 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1645 {
1646  __m128i x = a.val, y = b.val, z = c.val;
1647  __m128i t = __lsx_vmaddwev_w_h(z, x, y);
1648  return v_int32x4(__lsx_vmaddwod_w_h(t, x, y));
1649 }
1650 
1651 // 32 >> 64
1652 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
1653 {
1654  __m128i x = a.val, y = b.val;
1655  return v_int64x2(__lsx_vmaddwod_d_w(__lsx_vmulwev_d_w(x, y), x, y));
1656 }
1657 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1658 {
1659  __m128i x = a.val, y = b.val, z = c.val;
1660  __m128i t = __lsx_vmaddwev_d_w(z, x, y);
1661  return v_int64x2(__lsx_vmaddwod_d_w(t, x, y));
1662 }
1663 
1664 // 8 >> 32
1665 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
1666 {
1667  __m128i x = a.val, y = b.val;
1668  __m128i even = __lsx_vmulwev_h_bu(x, y);
1669  __m128i odd = __lsx_vmulwod_h_bu(x, y);
1670  __m128i prod0 = __lsx_vhaddw_wu_hu(even, even);
1671  __m128i prod1 = __lsx_vhaddw_wu_hu(odd, odd);
1672  return v_uint32x4(__lsx_vadd_w(prod0, prod1));
1673 }
1674 
1675 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1676 { return v_dotprod_expand(a, b) + c ;}
1677 
1678 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
1679 {
1680  __m128i x = a.val, y = b.val;
1681  __m128i even = __lsx_vmulwev_h_b(x, y);
1682  __m128i odd = __lsx_vmulwod_h_b(x, y);
1683  __m128i prod0 = __lsx_vhaddw_w_h(even, even);
1684  __m128i prod1 = __lsx_vhaddw_w_h(odd, odd);
1685  return v_int32x4(__lsx_vadd_w(prod0, prod1));
1686 }
1687 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1688 { return v_dotprod_expand(a, b) + c; }
1689 
1690 // 16 >> 64
1691 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
1692 {
1693  __m128i x = a.val, y = b.val;
1694  __m128i even = __lsx_vmulwev_w_hu(x, y);
1695  __m128i odd = __lsx_vmulwod_w_hu(x, y);
1696  __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
1697  __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
1698  return v_uint64x2(__lsx_vadd_d(prod0, prod1));
1699 }
1700 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1701 { return v_dotprod_expand(a, b) + c; }
1702 
1703 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
1704 {
1705  __m128i x = a.val, y = b.val;
1706  __m128i even = __lsx_vmulwev_w_h(x, y);
1707  __m128i odd = __lsx_vmulwod_w_h(x, y);
1708  __m128i prod0 = __lsx_vhaddw_d_w(even, even);
1709  __m128i prod1 = __lsx_vhaddw_d_w(odd, odd);
1710  return v_int64x2(__lsx_vadd_d(prod0, prod1));
1711 }
1712 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1713 { return v_dotprod_expand(a, b) + c; }
1714 
1715 //32 >> 64f
1716 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
1717 { return v_cvt_f64(v_dotprod(a, b)); }
1718 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1719 { return v_dotprod_expand(a, b) + c; }
1720 
1721 
1723 
1724 // 16 >> 32
1725 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
1726 { return v_dotprod(a, b); }
1727 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1728 { return v_dotprod(a, b, c); }
1729 
1730 // 32 >> 64
1731 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
1732 { return v_dotprod(a, b); }
1733 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1734 { return v_dotprod(a, b, c); }
1735 
1736 // 8 >> 32
1737 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
1738 { return v_dotprod_expand(a, b); }
1739 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1740 { return v_dotprod_expand(a, b, c); }
1741 
1742 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
1743 { return v_dotprod_expand(a, b); }
1744 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1745 { return v_dotprod_expand(a, b, c); }
1746 
1747 // 16 >> 64
1748 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1749 {
1750  __m128i x = a.val, y = b.val;
1751  __m128i even = __lsx_vmulwev_w_hu(x, y);
1752  __m128i odd = __lsx_vmulwod_w_hu(x, y);
1753  __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
1754  __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
1755  return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
1756 }
1757 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1758 { return v_dotprod_expand_fast(a, b) + c; }
1759 
1760 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1761 {
1762  __m128i x = a.val, y = b.val;
1763  __m128i prod = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y);
1764  __m128i sign = __lsx_vsrai_w(prod, 31);
1765  __m128i lo = __lsx_vilvl_w(sign, prod);
1766  __m128i hi = __lsx_vilvh_w(sign, prod);
1767  return v_int64x2(__lsx_vadd_d(lo, hi));
1768 }
1769 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1770 { return v_dotprod_expand_fast(a, b) + c; }
1771 
1772 // 32 >> 64f
1773 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1774 { return v_dotprod_expand(a, b); }
1775 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1776 { return v_dotprod_expand(a, b, c); }
1777 
1778 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
1779  const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3)
1780 {
1781  __m128i x = (__m128i)v.val;
1782  __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
1783  __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
1784  __m128 v2 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val);
1785  __m128 v3 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xFF), m3.val);
1786 
1787  return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), __lsx_vfadd_s(v2, v3)));
1788 }
1789 
1790 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
1791  const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a)
1792 {
1793  __m128i x = (__m128i)v.val;
1794  __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
1795  __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
1796  __m128 v2 = __lsx_vfmadd_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val, a.val);
1797 
1798  return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), v2));
1799 }
1800 
1801 #define OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(_Tpvec, cast_from, cast_to) \
1802  inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1803  const _Tpvec& a2, const _Tpvec& a3, \
1804  _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1805  { \
1806  __m128i t0 = cast_from(__lsx_vilvl_w(a1.val, a0.val)); \
1807  __m128i t1 = cast_from(__lsx_vilvl_w(a3.val, a2.val)); \
1808  __m128i t2 = cast_from(__lsx_vilvh_w(a1.val, a0.val)); \
1809  __m128i t3 = cast_from(__lsx_vilvh_w(a3.val, a2.val)); \
1810  b0.val = cast_to(__lsx_vilvl_d(t1, t0)); \
1811  b1.val = cast_to(__lsx_vilvh_d(t1, t0)); \
1812  b2.val = cast_to(__lsx_vilvl_d(t3, t2)); \
1813  b3.val = cast_to(__lsx_vilvh_d(t3, t2)); \
1814  }
1815 
1816 OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_uint32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1817 OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_int32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1818 
1819 inline void v_transpose4x4(const v_float32x4& a0, const v_float32x4& a1,
1820  const v_float32x4& a2, const v_float32x4& a3,
1821  v_float32x4& b0, v_float32x4& b1, v_float32x4& b2, v_float32x4& b3)
1822 {
1823  __m128i vec0 = (__m128i)a0.val, vec1 = (__m128i)a1.val;
1824  __m128i vec2 = (__m128i)a2.val, vec3 = (__m128i)a3.val;
1825  __m128i t0 = __lsx_vilvl_w(vec1, vec0);
1826  __m128i t1 = __lsx_vilvl_w(vec3, vec2);
1827  __m128i t2 = __lsx_vilvh_w(vec1, vec0);
1828  __m128i t3 = __lsx_vilvh_w(vec3, vec2);
1829  b0.val = __m128(__lsx_vilvl_d(t1, t0));
1830  b1.val = __m128(__lsx_vilvh_d(t1, t0));
1831  b2.val = __m128(__lsx_vilvl_d(t3, t2));
1832  b3.val = __m128(__lsx_vilvh_d(t3, t2));
1833 }
1834 
1836 
1837 /* Expand */
1838 #define OPENCV_HAL_IMPL_LSX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin_lo, intrin_hi) \
1839  inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1840  { \
1841  b0.val = intrin_lo(a.val, 0); \
1842  b1.val = intrin_hi(a.val); \
1843  } \
1844  inline _Tpwvec v_expand_low(const _Tpvec& a) \
1845  { return _Tpwvec(intrin_lo(a.val, 0)); } \
1846  inline _Tpwvec v_expand_high(const _Tpvec& a) \
1847  { return _Tpwvec(intrin_hi(a.val)); } \
1848  inline _Tpwvec v_load_expand(const _Tp* ptr) \
1849  { \
1850  __m128i a = __lsx_vld(ptr, 0); \
1851  return _Tpwvec(intrin_lo(a, 0)); \
1852  }
1853 
1854 OPENCV_HAL_IMPL_LSX_EXPAND(v_uint8x16, v_uint16x8, uchar, __lsx_vsllwil_hu_bu, __lsx_vexth_hu_bu)
1855 OPENCV_HAL_IMPL_LSX_EXPAND(v_int8x16, v_int16x8, schar, __lsx_vsllwil_h_b, __lsx_vexth_h_b)
1856 OPENCV_HAL_IMPL_LSX_EXPAND(v_uint16x8, v_uint32x4, ushort, __lsx_vsllwil_wu_hu, __lsx_vexth_wu_hu)
1857 OPENCV_HAL_IMPL_LSX_EXPAND(v_int16x8, v_int32x4, short, __lsx_vsllwil_w_h, __lsx_vexth_w_h)
1858 OPENCV_HAL_IMPL_LSX_EXPAND(v_uint32x4, v_uint64x2, unsigned, __lsx_vsllwil_du_wu, __lsx_vexth_du_wu)
1859 OPENCV_HAL_IMPL_LSX_EXPAND(v_int32x4, v_int64x2, int, __lsx_vsllwil_d_w, __lsx_vexth_d_w)
1860 
1861 #define OPENCV_HAL_IMPL_LSX_EXPAND_Q(_Tpvec, _Tp, intrin_lo, intrin_hi) \
1862  inline _Tpvec v_load_expand_q(const _Tp* ptr) \
1863  { \
1864  __m128i a = __lsx_vld(ptr, 0); \
1865  __m128i b = intrin_lo(a, 0); \
1866  return _Tpvec(intrin_hi(b, 0)); \
1867  }
1868 
1869 OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_uint32x4, uchar, __lsx_vsllwil_hu_bu, __lsx_vsllwil_wu_hu)
1870 OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_int32x4, schar, __lsx_vsllwil_h_b, __lsx_vsllwil_w_h)
1871 
1872 /* pack */
1873 // 16
1874 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
1875 { return v_int8x16(_lsx_packs_h(a.val, b.val)); }
1876 
1877 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
1878 { return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, 0)); }
1879 
1880 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
1881 { return v_uint8x16(_lsx_packus_h(a.val, b.val)); }
1882 
1883 inline void v_pack_store(schar* ptr, const v_int16x8& a)
1884 { v_store_low(ptr, v_pack(a, a)); }
1885 
1886 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
1887 { v_store_low(ptr, v_pack(a, a)); }
1888 
1889 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
1890 { v_store_low(ptr, v_pack_u(a, a)); }
1891 
1892 template<int n> inline
1893 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
1894 { return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, n)); }
1895 
1896 template<int n> inline
1897 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
1898 { __lsx_vstelm_d(__lsx_vssrlrni_bu_h(a.val, a.val, n), ptr, 0, 0); }
1899 
1900 template<int n> inline
1901 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
1902 { return v_uint8x16(__lsx_vssrarni_bu_h(b.val, a.val, n)); }
1903 
1904 template<int n> inline
1905 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
1906 { __lsx_vstelm_d(__lsx_vssrarni_bu_h(a.val, a.val, n), ptr, 0, 0); }
1907 
1908 template<int n> inline
1909 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
1910 { return v_int8x16(__lsx_vssrarni_b_h(b.val, a.val, n)); }
1911 
1912 template<int n> inline
1913 void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
1914 { __lsx_vstelm_d(__lsx_vssrarni_b_h(a.val, a.val, n), ptr, 0, 0); }
1915 
1916 //32
1917 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
1918 { return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, 0)); }
1919 
1920 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
1921 { return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, 0)); }
1922 
1923 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
1924 { return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, 0)); }
1925 
1926 inline void v_pack_store(short* ptr, const v_int32x4& a)
1927 { v_store_low(ptr, v_pack(a, a)); }
1928 
1929 inline void v_pack_store(ushort *ptr, const v_uint32x4& a)
1930 { __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
1931 
1932 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
1933 { __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
1934 
1935 template<int n> inline
1936 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
1937 { return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, n)); }
1938 
1939 template<int n> inline
1940 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
1941 { __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, n), ptr, 0, 0); }
1942 
1943 template<int n> inline
1944 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
1945 { return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, n)); }
1946 
1947 template<int n> inline
1948 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
1949 { __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, n), ptr, 0, 0); }
1950 
1951 template<int n> inline
1952 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
1953 { return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, n)); }
1954 
1955 template<int n> inline
1956 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
1957 { __lsx_vstelm_d(__lsx_vssrarni_h_w(a.val, a.val, n), ptr, 0, 0); }
1958 
1959 // 64
1960 // Non-saturaing pack
1961 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
1962 { return v_uint32x4(__lsx_vpickev_w(b.val, a.val)); }
1963 
1964 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
1965 { return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
1966 
1967 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
1968 { __lsx_vstelm_d(__lsx_vshuf4i_w(a.val, 0x08), ptr, 0, 0); }
1969 
1970 inline void v_pack_store(int *ptr, const v_int64x2& a)
1971 { v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(a)); }
1972 
1973 template<int n> inline
1974 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
1975 { return v_uint32x4(__lsx_vsrlrni_w_d(b.val, a.val, n)); }
1976 
1977 template<int n> inline
1978 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
1979 { __lsx_vstelm_d(__lsx_vsrlrni_w_d(a.val, a.val, n), ptr, 0, 0); }
1980 
1981 template<int n> inline
1982 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
1983 { return v_int32x4(__lsx_vsrarni_w_d(b.val, a.val, n)); }
1984 
1985 template<int n> inline
1986 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
1987 { __lsx_vstelm_d(__lsx_vsrarni_w_d(a.val, a.val, n), ptr, 0, 0); }
1988 
1989 // pack boolean
1990 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1991 { return v_uint8x16(__lsx_vssrarni_b_h(b.val, a.val, 0)); }
1992 
1993 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1994  const v_uint32x4& c, const v_uint32x4& d)
1995 {
1996  __m128i ab = __lsx_vssrarni_h_w(b.val, a.val, 0);
1997  __m128i cd = __lsx_vssrarni_h_w(d.val, c.val, 0);
1998  return v_uint8x16(__lsx_vssrarni_b_h(cd, ab, 0));
1999 }
2000 
2001 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
2002  const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
2003  const v_uint64x2& g, const v_uint64x2& h)
2004 {
2005  __m128i ab = __lsx_vssrarni_w_d(b.val, a.val, 0);
2006  __m128i cd = __lsx_vssrarni_w_d(d.val, c.val, 0);
2007  __m128i ef = __lsx_vssrarni_w_d(f.val, e.val, 0);
2008  __m128i gh = __lsx_vssrarni_w_d(h.val, g.val, 0);
2009 
2010  __m128i abcd = __lsx_vssrarni_h_w(cd, ab, 0);
2011  __m128i efgh = __lsx_vssrarni_h_w(gh, ef, 0);
2012  return v_uint8x16(__lsx_vssrarni_b_h(efgh, abcd, 0));
2013 }
2014 
2015 /* Recombine */
2016 // its up there with load and store operations
2017 
2018 /* Extract */
2019 #define OPENCV_HAL_IMPL_LSX_EXTRACT(_Tpvec) \
2020  template<int s> \
2021  inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2022  { return v_rotate_right<s>(a, b); }
2023 
2024 OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint8x16)
2025 OPENCV_HAL_IMPL_LSX_EXTRACT(v_int8x16)
2026 OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint16x8)
2027 OPENCV_HAL_IMPL_LSX_EXTRACT(v_int16x8)
2028 OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint32x4)
2029 OPENCV_HAL_IMPL_LSX_EXTRACT(v_int32x4)
2030 OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint64x2)
2031 OPENCV_HAL_IMPL_LSX_EXTRACT(v_int64x2)
2032 OPENCV_HAL_IMPL_LSX_EXTRACT(v_float32x4)
2033 OPENCV_HAL_IMPL_LSX_EXTRACT(v_float64x2)
2034 
2035 #define OPENCV_HAL_IMPL_LSX_EXTRACT_N(_Tpvec, _Twvec, intrin) \
2036 template<int i> \
2037 inline _Twvec v_extract_n(const _Tpvec& a) \
2038 { return (_Twvec)intrin(a.val, i); }
2039 
2040 OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint8x16, uchar, __lsx_vpickve2gr_b)
2041 OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int8x16, schar, __lsx_vpickve2gr_b)
2042 OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint16x8, ushort, __lsx_vpickve2gr_h)
2043 OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int16x8, short, __lsx_vpickve2gr_h)
2044 OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint32x4, uint, __lsx_vpickve2gr_w)
2045 OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int32x4, int, __lsx_vpickve2gr_w)
2046 OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint64x2, uint64, __lsx_vpickve2gr_d)
2047 OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int64x2, int64, __lsx_vpickve2gr_d)
2048 
2049 template<int i>
2050 inline float v_extract_n(const v_float32x4& v)
2051 {
2052  union { uint iv; float fv; } d;
2053  d.iv = __lsx_vpickve2gr_w(v.val, i);
2054  return d.fv;
2055 }
2056 
2057 template<int i>
2058 inline double v_extract_n(const v_float64x2& v)
2059 {
2060  union { uint64 iv; double dv; } d;
2061  d.iv = __lsx_vpickve2gr_d(v.val, i);
2062  return d.dv;
2063 }
2064 
2065 template<int i>
2066 inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
2067 { return v_uint32x4(__lsx_vreplvei_w(a.val, i)); }
2068 
2069 template<int i>
2070 inline v_int32x4 v_broadcast_element(const v_int32x4& a)
2071 { return v_int32x4(__lsx_vreplvei_w(a.val, i)); }
2072 
2073 template<int i>
2075 { return v_float32x4((__m128)__lsx_vreplvei_w((__m128i)a.val, i)); }
2076 
2078 
2079 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
2080 {
2081  __m128i t0 = __lsx_vld(ptr, 0);
2082  __m128i t1 = __lsx_vld(ptr, 16);
2083 
2084  a.val = __lsx_vpickev_b(t1, t0);
2085  b.val = __lsx_vpickod_b(t1, t0);
2086 }
2087 
2088 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2089 {
2090  __m128i t0 = __lsx_vld(ptr, 0);
2091  __m128i t1 = __lsx_vld(ptr, 16);
2092  a.val = __lsx_vpickev_h(t1, t0);
2093  b.val = __lsx_vpickod_h(t1, t0);
2094 }
2095 
2096 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2097 {
2098  __m128i t0 = __lsx_vld(ptr, 0);
2099  __m128i t1 = __lsx_vld(ptr, 16);
2100  a.val = __lsx_vpickev_w(t1, t0);
2101  b.val = __lsx_vpickod_w(t1, t0);
2102 }
2103 
2104 inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b)
2105 {
2106  __m128i t0 = __lsx_vld(ptr, 0);
2107  __m128i t1 = __lsx_vld(ptr, 16);
2108  a.val = __lsx_vilvl_d(t1, t0);
2109  b.val = __lsx_vilvh_d(t1, t0);
2110 }
2111 
2112 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
2113 {
2114  __m128i t0 = __lsx_vld(ptr, 0);
2115  __m128i t1 = __lsx_vld(ptr, 16);
2116  __m128i t2 = __lsx_vld(ptr, 32);
2117  const __m128i shuff0 = _v128_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2118  const __m128i shuff1 = _v128_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2119  __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff0);
2120  __m128i b0 = __lsx_vbitsel_v(t1, t0, shuff1);
2121  __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
2122  const __m128i shuff_a = _v128_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29);
2123  const __m128i shuff_b = _v128_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30);
2124  const __m128i shuff_c = _v128_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31);
2125 
2126  a.val = __lsx_vshuf_b(t2, a0, shuff_a);
2127  b.val = __lsx_vshuf_b(t2, b0, shuff_b);
2128  c.val = __lsx_vshuf_b(t2, c0, shuff_c);
2129 }
2130 
2131 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2132 {
2133  __m128i t0 = __lsx_vld(ptr, 0);
2134  __m128i t1 = __lsx_vld(ptr, 16);
2135  __m128i t2 = __lsx_vld(ptr, 32);
2136  const __m128i shuff0 = _v128_setr_h(0, 0, -1, 0, 0, -1, 0, 0);
2137  const __m128i shuff1 = _v128_setr_h(0, -1, 0, 0, -1, 0, 0, -1);
2138 
2139  __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff1);
2140  __m128i b0 = __lsx_vbitsel_v(t0, t1, shuff0);
2141  __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
2142 
2143  const __m128i shuff_a = _v128_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 20, 21, 26, 27);
2144  const __m128i shuff_b = _v128_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 16, 17, 22, 23, 28, 29);
2145  const __m128i shuff_c = _v128_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31);
2146 
2147  a.val = __lsx_vshuf_b(t2, a0, shuff_a);
2148  b.val = __lsx_vshuf_b(t2, b0, shuff_b);
2149  c.val = __lsx_vshuf_b(t2, c0, shuff_c);
2150 }
2151 
2152 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2153 {
2154  __m128i t0 = __lsx_vld(ptr, 0);
2155  __m128i t1 = __lsx_vld(ptr, 16);
2156  __m128i t2 = __lsx_vld(ptr, 32);
2157 
2158  __m128i a0 = __lsx_vpermi_w(t1, t0, 0xAC);
2159  __m128i b0 = __lsx_vpermi_w(t1, t0, 0xC5);
2160  __m128i c0 = __lsx_vpermi_w(t1, t0, 0x5A);
2161 
2162  a.val = __lsx_vextrins_w(a0, t2, 0x31);
2163  b0 = __lsx_vshuf4i_w(b0, 0x38);
2164  c0 = __lsx_vshuf4i_w(c0, 0x8);
2165  b.val = __lsx_vextrins_w(b0, t2, 0x32);
2166  c.val = __lsx_vpermi_w(t2, c0, 0xC4);
2167 }
2168 
2169 inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2170 {
2171  __m128i t0 = __lsx_vld(ptr, 0);
2172  __m128i t1 = __lsx_vld(ptr, 16);
2173  __m128i t2 = __lsx_vld(ptr, 32);
2174 
2175  a.val = __lsx_vshuf4i_d(t0, t1, 0xC);
2176  b.val = __lsx_vshuf4i_d(t0, t2, 0x9);
2177  c.val = __lsx_vshuf4i_d(t1, t2, 0xC);
2178 }
2179 
2180 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2181 {
2182  __m128i t0 = __lsx_vld(ptr, 0);
2183  __m128i t1 = __lsx_vld(ptr, 16);
2184  __m128i t2 = __lsx_vld(ptr, 32);
2185  __m128i t3 = __lsx_vld(ptr, 48);
2186 
2187  __m128i ac_lo = __lsx_vpickev_b(t1, t0);
2188  __m128i bd_lo = __lsx_vpickod_b(t1, t0);
2189  __m128i ac_hi = __lsx_vpickev_b(t3, t2);
2190  __m128i bd_hi = __lsx_vpickod_b(t3, t2);
2191 
2192  a.val = __lsx_vpickev_b(ac_hi, ac_lo);
2193  c.val = __lsx_vpickod_b(ac_hi, ac_lo);
2194  b.val = __lsx_vpickev_b(bd_hi, bd_lo);
2195  d.val = __lsx_vpickod_b(bd_hi, bd_lo);
2196 }
2197 
2198 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2199 {
2200  __m128i t0 = __lsx_vld(ptr, 0);
2201  __m128i t1 = __lsx_vld(ptr, 16);
2202  __m128i t2 = __lsx_vld(ptr, 32);
2203  __m128i t3 = __lsx_vld(ptr, 48);
2204 
2205  __m128i ac_lo = __lsx_vpickev_h(t1, t0);
2206  __m128i bd_lo = __lsx_vpickod_h(t1, t0);
2207  __m128i ac_hi = __lsx_vpickev_h(t3, t2);
2208  __m128i bd_hi = __lsx_vpickod_h(t3, t2);
2209 
2210  a.val = __lsx_vpickev_h(ac_hi, ac_lo);
2211  c.val = __lsx_vpickod_h(ac_hi, ac_lo);
2212  b.val = __lsx_vpickev_h(bd_hi, bd_lo);
2213  d.val = __lsx_vpickod_h(bd_hi, bd_lo);
2214 }
2215 
2216 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2217 {
2218  __m128i p0 = __lsx_vld(ptr, 0);
2219  __m128i p1 = __lsx_vld(ptr, 16);
2220  __m128i p2 = __lsx_vld(ptr, 32);
2221  __m128i p3 = __lsx_vld(ptr, 48);
2222 
2223  __m128i t0 = __lsx_vilvl_w(p1, p0);
2224  __m128i t1 = __lsx_vilvl_w(p3, p2);
2225  __m128i t2 = __lsx_vilvh_w(p1, p0);
2226  __m128i t3 = __lsx_vilvh_w(p3, p2);
2227  a.val = __lsx_vilvl_d(t1, t0);
2228  b.val = __lsx_vilvh_d(t1, t0);
2229  c.val = __lsx_vilvl_d(t3, t2);
2230  d.val = __lsx_vilvh_d(t3, t2);
2231 }
2232 
2233 inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2234 {
2235  __m128i t0 = __lsx_vld(ptr, 0);
2236  __m128i t1 = __lsx_vld(ptr, 16);
2237  __m128i t2 = __lsx_vld(ptr, 32);
2238  __m128i t3 = __lsx_vld(ptr, 48);
2239 
2240  a.val = __lsx_vilvl_d(t2, t0);
2241  b.val = __lsx_vilvh_d(t2, t0);
2242  c.val = __lsx_vilvl_d(t3, t1);
2243  d.val = __lsx_vilvh_d(t3, t1);
2244 }
2245 
2247 
2248 inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2250 {
2251  __m128i v0 = __lsx_vilvl_b(b.val, a.val);
2252  __m128i v1 = __lsx_vilvh_b(b.val, a.val);
2253 
2254  __lsx_vst(v0, ptr, 0);
2255  __lsx_vst(v1, ptr, 16);
2256 }
2257 
2258 inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2260 {
2261  __m128i v0 = __lsx_vilvl_h(b.val, a.val);
2262  __m128i v1 = __lsx_vilvh_h(b.val, a.val);
2263 
2264  __lsx_vst(v0, ptr, 0);
2265  __lsx_vst(v1, ptr, 16);
2266 }
2267 
2268 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2270 {
2271  __m128i v0 = __lsx_vilvl_w(b.val, a.val);
2272  __m128i v1 = __lsx_vilvh_w(b.val, a.val);
2273 
2274  __lsx_vst(v0, ptr, 0);
2275  __lsx_vst(v1, ptr, 16);
2276 }
2277 
2278 inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
2280 {
2281  __m128i v0 = __lsx_vilvl_d(b.val, a.val);
2282  __m128i v1 = __lsx_vilvh_d(b.val, a.val);
2283 
2284  __lsx_vst(v0, ptr, 0);
2285  __lsx_vst(v1, ptr, 16);
2286 }
2287 
2288 inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c,
2290 {
2291  __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
2292  __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
2293  __m128i v_c = c.val;
2294  const __m128i shuff0 = _v128_setr_b(0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10);
2295  const __m128i shuff1 = _v128_setr_b(11, 21, 12, 13, 22, 14, 15, 23, 0, 0, 0, 0, 0, 0, 0, 0);
2296  const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 24, 18, 19, 25, 20, 21);
2297  const __m128i shuff3 = _v128_setr_b(26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31);
2298  __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
2299 
2300  __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
2301  __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
2302  __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
2303  dst1 = __lsx_vshuf_b(abc, dst1, shuff2);
2304 
2305  __lsx_vst(dst0, ptr, 0);
2306  __lsx_vst(dst1, ptr, 16);
2307  __lsx_vst(dst2, ptr, 32);
2308 }
2309 
2310 inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, const v_uint16x8& c,
2312 {
2313  __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
2314  __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
2315  __m128i v_c = c.val;
2316  const __m128i shuff0 = _v128_setr_b(0, 1, 2, 3, 16, 17, 4, 5, 6, 7, 18, 19, 8, 9, 10, 11);
2317  const __m128i shuff1 = _v128_setr_b(20, 21, 12, 13, 14, 15, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0);
2318  const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 24, 25, 20, 21);
2319  const __m128i shuff3 = _v128_setr_b(6, 7, 26, 27, 8, 9, 10, 11, 28, 29, 12, 13, 14, 15, 30, 31);
2320  __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
2321 
2322  __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
2323  __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
2324  __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
2325  dst1 = __lsx_vshuf_b(abc, dst1, shuff2);
2326 
2327  __lsx_vst(dst0, ptr, 0);
2328  __lsx_vst(dst1, ptr, 16);
2329  __lsx_vst(dst2, ptr, 32);
2330 }
2331 
2332 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, const v_uint32x4& c,
2334 {
2335  __m128i v_c = c.val;
2336  __m128i ab_lo = __lsx_vilvl_w(b.val, a.val); //a0 b0 a1 b1
2337  __m128i ab_hi = __lsx_vilvh_w(b.val, a.val); //a2 b2 a3 b3
2338  __m128i bc_od = __lsx_vpackod_w(v_c, b.val); // b1 c1 b3 c3
2339 
2340  __m128i dst0 = __lsx_vshuf4i_w(ab_lo, 0xB4); //a0 b0 b1 a1
2341  __m128i dst1 = __lsx_vilvl_d(ab_hi, bc_od); //b1 c1 a2 b2
2342  __m128i dst2 = __lsx_vpermi_w(bc_od, ab_hi, 0xE8); //a2, a3, b3, c3
2343 
2344  dst0 = __lsx_vextrins_w(dst0, v_c, 0x20);
2345  dst2 = __lsx_vextrins_w(dst2, v_c, 0x2);
2346  __lsx_vst(dst0, ptr, 0); //a0 b0 c0 a1
2347  __lsx_vst(dst1, ptr, 16); //b1 c1 a2 b2
2348  __lsx_vst(dst2, ptr, 32); //c2 a3 b3 c3
2349 }
2350 
2351 inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
2353 {
2354  __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
2355  __m128i dst1 = __lsx_vpermi_w(a.val, c.val, 0xE4);
2356  __m128i dst2 = __lsx_vilvh_d(c.val, b.val);
2357 
2358  __lsx_vst(dst0, ptr, 0);
2359  __lsx_vst(dst1, ptr, 16);
2360  __lsx_vst(dst2, ptr, 32);
2361 }
2362 
2363 inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2364  const v_uint8x16& c, const v_uint8x16& d,
2366 {
2367  __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
2368  __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
2369  __m128i cd_lo = __lsx_vilvl_b(d.val, c.val);
2370  __m128i cd_hi = __lsx_vilvh_b(d.val, c.val);
2371 
2372  __m128i dst0 = __lsx_vilvl_h(cd_lo, ab_lo);
2373  __m128i dst1 = __lsx_vilvh_h(cd_lo, ab_lo);
2374  __m128i dst2 = __lsx_vilvl_h(cd_hi, ab_hi);
2375  __m128i dst3 = __lsx_vilvh_h(cd_hi, ab_hi);
2376 
2377  __lsx_vst(dst0, ptr, 0);
2378  __lsx_vst(dst1, ptr, 16);
2379  __lsx_vst(dst2, ptr, 32);
2380  __lsx_vst(dst3, ptr, 48);
2381 }
2382 
2383 inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2384  const v_uint16x8& c, const v_uint16x8& d,
2386 {
2387  __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
2388  __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
2389  __m128i cd_lo = __lsx_vilvl_h(d.val, c.val);
2390  __m128i cd_hi = __lsx_vilvh_h(d.val, c.val);
2391 
2392  __m128i dst0 = __lsx_vilvl_w(cd_lo, ab_lo);
2393  __m128i dst1 = __lsx_vilvh_w(cd_lo, ab_lo);
2394  __m128i dst2 = __lsx_vilvl_w(cd_hi, ab_hi);
2395  __m128i dst3 = __lsx_vilvh_w(cd_hi, ab_hi);
2396 
2397  __lsx_vst(dst0, ptr, 0);
2398  __lsx_vst(dst1, ptr, 16);
2399  __lsx_vst(dst2, ptr, 32);
2400  __lsx_vst(dst3, ptr, 48);
2401 }
2402 
2403 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2404  const v_uint32x4& c, const v_uint32x4& d,
2406 {
2407  __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
2408  __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
2409  __m128i cd_lo = __lsx_vilvl_w(d.val, c.val);
2410  __m128i cd_hi = __lsx_vilvh_w(d.val, c.val);
2411 
2412  __m128i dst0 = __lsx_vilvl_d(cd_lo, ab_lo);
2413  __m128i dst1 = __lsx_vilvh_d(cd_lo, ab_lo);
2414  __m128i dst2 = __lsx_vilvl_d(cd_hi, ab_hi);
2415  __m128i dst3 = __lsx_vilvh_d(cd_hi, ab_hi);
2416 
2417  __lsx_vst(dst0, ptr, 0);
2418  __lsx_vst(dst1, ptr, 16);
2419  __lsx_vst(dst2, ptr, 32);
2420  __lsx_vst(dst3, ptr, 48);
2421 }
2422 
2423 inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
2424  const v_uint64x2& c, const v_uint64x2& d,
2426 {
2427  __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
2428  __m128i dst2 = __lsx_vilvh_d(b.val, a.val);
2429  __m128i dst1 = __lsx_vilvl_d(d.val, c.val);
2430  __m128i dst3 = __lsx_vilvh_d(d.val, c.val);
2431 
2432  __lsx_vst(dst0, ptr, 0);
2433  __lsx_vst(dst1, ptr, 16);
2434  __lsx_vst(dst2, ptr, 32);
2435  __lsx_vst(dst3, ptr, 48);
2436 }
2437 
2438 #define OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2439 inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0) \
2440 { \
2441  _Tpvec1 a1, b1; \
2442  v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2443  a0 = v_reinterpret_as_##suffix0(a1); \
2444  b0 = v_reinterpret_as_##suffix0(b1); \
2445 } \
2446 inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0) \
2447 { \
2448  _Tpvec1 a1, b1, c1; \
2449  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2450  a0 = v_reinterpret_as_##suffix0(a1); \
2451  b0 = v_reinterpret_as_##suffix0(b1); \
2452  c0 = v_reinterpret_as_##suffix0(c1); \
2453 } \
2454 inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, \
2455  _Tpvec0& c0, _Tpvec0& d0) \
2456 { \
2457  _Tpvec1 a1, b1, c1, d1; \
2458  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2459  a0 = v_reinterpret_as_##suffix0(a1); \
2460  b0 = v_reinterpret_as_##suffix0(b1); \
2461  c0 = v_reinterpret_as_##suffix0(c1); \
2462  d0 = v_reinterpret_as_##suffix0(d1); \
2463 } \
2464 inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2465  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2466 { \
2467  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2468  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2469  v_store_interleave((_Tp1*)ptr, a1, b1); \
2470 } \
2471 inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0,\
2472  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2473 { \
2474  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2475  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2476  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2477  v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
2478 } \
2479 inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2480  const _Tpvec0& c0, const _Tpvec0& d0, \
2481  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2482 { \
2483  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2484  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2485  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2486  _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2487  v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
2488 }
2489 
2490 OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2491 OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2492 OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2493 OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
2494 OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2495 OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
2496 
2497 //
2498 // FP16
2499 //
2500 
2501 inline v_float32x4 v_load_expand(const hfloat* ptr)
2502 {
2503 #if CV_FP16
2504  return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0)));
2505 #else
2506  float CV_DECL_ALIGNED(32) buf[4];
2507  for (int i = 0; i < 4; i++)
2508  buf[i] = (float)ptr[i];
2509  return v_float32x4((__m128)__lsx_vld(buf, 0));
2510 #endif
2511 }
2512 
2513 inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
2514 {
2515 #if CV_FP16
2516  __m128i res = (__m218i)__lsx_vfcvt_h_s(a.val, a.val);
2517  __lsx_vstelm_d(res, ptr, 0, 0);
2518 #else
2519  float CV_DECL_ALIGNED(32) buf[4];
2520  v_store_aligned(buf, a);
2521  for (int i = 0; i < 4; i++)
2522  ptr[i] = hfloat(buf[i]);
2523 #endif
2524 }
2525 
2526 //
2527 // end of FP16
2528 //
2529 
2530 inline void v_cleanup() {}
2531 
2532 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2533 
2535 
2536 } // cv::
2537 
2538 #endif // OPENCV_HAL_INTRIN_LSX_HPP
const int * idx
Definition: core_c.h:668
CvArr * dst0
Definition: core_c.h:988
const CvArr * vec2
Definition: core_c.h:1429
CvArr CvArr CvArr CvArr * dst3
Definition: core_c.h:989
CvArr CvArr * dst1
Definition: core_c.h:988
const CvArr CvArr * x
Definition: core_c.h:1195
CvArr CvArr CvArr * dst2
Definition: core_c.h:989
const CvArr const CvArr CvArr * result
Definition: core_c.h:1423
const CvArr * y
Definition: core_c.h:1187
signed char schar
Definition: interface.h:48
uint32_t uint
Definition: interface.h:42
unsigned char uchar
Definition: interface.h:51
int64_t int64
Definition: interface.h:61
unsigned short ushort
Definition: interface.h:52
uint64_t uint64
Definition: interface.h:62
v_reg< _Tp, n > v_invsqrt(const v_reg< _Tp, n > &a)
Inversed square root.
Definition: intrin_cpp.hpp:1007
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut_quads(const _Tp *tab, const int *idx)
Definition: intrin_cpp.hpp:2640
v_reg< int, n > v_round(const v_reg< float, n > &a)
Round elements.
Definition: intrin_cpp.hpp:2424
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements and expand.
Definition: intrin_cpp.hpp:1185
v_reg< schar, 16 > v_int8x16
Sixteen 8-bit signed integer values.
Definition: intrin_cpp.hpp:491
v_reg< float, n > v_cvt_f32(const v_reg< int, n > &a)
Convert to float.
Definition: intrin_cpp.hpp:2534
v_reg< uchar, 16 > v_uint8x16
Sixteen 8-bit unsigned integer values.
Definition: intrin_cpp.hpp:489
int v_signmask(const v_reg< _Tp, n > &a)
Get negative values mask.
Definition: intrin_cpp.hpp:1392
v_reg< _Tp, n > v_mul_hi(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Multiply and extract high part.
Definition: intrin_cpp.hpp:1233
v_reg< float, n > v_matmul(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication.
Definition: intrin_cpp.hpp:3193
v_reg< int64, 2 > v_int64x2
Two 64-bit signed integer values.
Definition: intrin_cpp.hpp:507
v_reg< _Tp, n > v_interleave_pairs(const v_reg< _Tp, n > &vec)
Definition: intrin_cpp.hpp:2703
V_TypeTraits< typename V_TypeTraits< _Tp >::abs_type >::sum_type v_reduce_sad(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Sum absolute differences of values.
Definition: intrin_cpp.hpp:1374
v_reg< ushort, 8 > v_uint16x8
Eight 16-bit unsigned integer values.
Definition: intrin_cpp.hpp:493
CV_INLINE v_reg< double, n/2 > v_cvt_f64(const v_reg< int, n > &a)
Convert lower half to double.
Definition: intrin_cpp.hpp:2573
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut(const _Tp *tab, const int *idx)
Definition: intrin_cpp.hpp:2626
V_TypeTraits< _Tp >::sum_type v_reduce_sum(const v_reg< _Tp, n > &a)
Element shift left among vector.
Definition: intrin_cpp.hpp:1335
void v_store_low(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (lower half)
Definition: intrin_cpp.hpp:2216
v_reg< _Tp, n > v_fma(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
Multiply and add.
Definition: intrin_cpp.hpp:1046
int v_scan_forward(const v_reg< _Tp, n > &a)
Get first negative lane index.
Definition: intrin_cpp.hpp:1409
v_reg< int, n > v_trunc(const v_reg< float, n > &a)
Truncate elements.
Definition: intrin_cpp.hpp:2475
v_reg< int, 4 > v_int32x4
Four 32-bit signed integer values.
Definition: intrin_cpp.hpp:499
v_reg< float, n > v_not_nan(const v_reg< float, n > &a)
Less-than comparison.
Definition: intrin_cpp.hpp:890
v_reg< float, n > v_reduce_sum4(const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Sums all elements of each input vector, returns the vector of sums.
Definition: intrin_cpp.hpp:1353
v_reg< _Tp, n > v_interleave_quads(const v_reg< _Tp, n > &vec)
Definition: intrin_cpp.hpp:2716
v_reg< unsigned, 4 > v_uint32x4
Four 32-bit unsigned integer values.
Definition: intrin_cpp.hpp:497
v_reg< _Tp, n > v_pack_triplets(const v_reg< _Tp, n > &vec)
Definition: intrin_cpp.hpp:2733
v_reg< _Tp, n > v_muladd(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
A synonym for v_fma.
Definition: intrin_cpp.hpp:1057
v_reg< int, n > v_floor(const v_reg< float, n > &a)
Floor elements.
Definition: intrin_cpp.hpp:2449
CV_INLINE v_reg< _Tp, n > & operator*=(v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
v_reg< _Tp, n > v_broadcast_element(const v_reg< _Tp, n > &a)
Broadcast i-th element of vector.
Definition: intrin_cpp.hpp:2413
v_reg< _Tp, n > v_reverse(const v_reg< _Tp, n > &a)
Vector reverse order.
Definition: intrin_cpp.hpp:2343
void v_mul_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &c, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &d)
Multiply and expand.
Definition: intrin_cpp.hpp:1216
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements and expand.
Definition: intrin_cpp.hpp:1142
void v_pack_store(hfloat *ptr, const v_reg< float, n > &v)
Definition: intrin_cpp.hpp:3289
void v_cleanup()
Definition: intrin_cpp.hpp:3297
v_reg< float, n > v_matmuladd(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication and add.
Definition: intrin_cpp.hpp:3223
void v_store_interleave(_Tp *ptr, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, hal::StoreMode=hal::STORE_UNALIGNED)
Interleave and store (2 channels)
Definition: intrin_cpp.hpp:2115
void v_lut_deinterleave(const float *tab, const v_reg< int, n > &idx, v_reg< float, n > &x, v_reg< float, n > &y)
Definition: intrin_cpp.hpp:2681
void v_transpose4x4(v_reg< _Tp, n > &a0, const v_reg< _Tp, n > &a1, const v_reg< _Tp, n > &a2, const v_reg< _Tp, n > &a3, v_reg< _Tp, n > &b0, v_reg< _Tp, n > &b1, v_reg< _Tp, n > &b2, v_reg< _Tp, n > &b3)
Transpose 4x4 matrix.
Definition: intrin_cpp.hpp:2761
v_reg< uint64, 2 > v_uint64x2
Two 64-bit unsigned integer values.
Definition: intrin_cpp.hpp:505
v_reg< typename V_TypeTraits< _Tp >::w_type, simd128_width/sizeof(typename V_TypeTraits< _Tp >::w_type)> v_load_expand(const _Tp *ptr)
Load register contents from memory with double expand.
Definition: intrin_cpp.hpp:1872
v_reg< int, n > v_ceil(const v_reg< float, n > &a)
Ceil elements.
Definition: intrin_cpp.hpp:2462
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements.
Definition: intrin_cpp.hpp:1077
v_reg< float, 4 > v_float32x4
Four 32-bit floating point values (single precision)
Definition: intrin_cpp.hpp:501
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_absdiff(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Add values without saturation.
Definition: intrin_cpp.hpp:953
_Tp v_extract_n(const v_reg< _Tp, n > &v)
Vector extract.
Definition: intrin_cpp.hpp:2397
v_reg< _Tp, n > v_absdiffs(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Saturating absolute difference.
Definition: intrin_cpp.hpp:994
v_reg< _Tp, n > v_select(const v_reg< _Tp, n > &mask, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Per-element select (blend operation)
Definition: intrin_cpp.hpp:1451
CV_INLINE v_reg< double,(n/2)> v_cvt_f64_high(const v_reg< int, n > &a)
Convert to double high part of vector.
Definition: intrin_cpp.hpp:2584
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements.
Definition: intrin_cpp.hpp:1116
void v_store_aligned(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (aligned)
Definition: intrin_cpp.hpp:2251
v_reg< uchar, 2 *n > v_pack_b(const v_reg< ushort, n > &a, const v_reg< ushort, n > &b)
! For 16-bit boolean values
Definition: intrin_cpp.hpp:3111
v_reg< short, 8 > v_int16x8
Eight 16-bit signed integer values.
Definition: intrin_cpp.hpp:495
v_reg< double, 2 > v_float64x2
Two 64-bit floating point values (double precision)
Definition: intrin_cpp.hpp:503
void v_load_deinterleave(const _Tp *ptr, v_reg< _Tp, n > &a, v_reg< _Tp, n > &b)
Load and deinterleave (2 channels)
Definition: intrin_cpp.hpp:2043
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut_pairs(const _Tp *tab, const int *idx)
Definition: intrin_cpp.hpp:2633
softfloat max(const softfloat &a, const softfloat &b)
Definition: softfloat.hpp:440
softfloat min(const softfloat &a, const softfloat &b)
Min and Max functions.
Definition: softfloat.hpp:437
#define CV_DECL_ALIGNED(x)
Definition: cvdef.h:243
CV_EXPORTS OutputArray int double double InputArray mask
Definition: imgproc.hpp:2132
StoreMode
Definition: intrin.hpp:100
@ STORE_UNALIGNED
Definition: intrin.hpp:101
"black box" representation of the file storage associated with a file on disk.
Definition: calib3d.hpp:441
DualQuat< T > operator*(const T a, const DualQuat< T > &q)
Definition: dualquaternion.inl.hpp:274
_Tp get0() const
Access first value.
Definition: intrin_cpp.hpp:437