EstervQrCode 2.0.0
Library for QR code manipulation
intrin_lsx.hpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

#ifndef OPENCV_HAL_INTRIN_LSX_HPP
#define OPENCV_HAL_INTRIN_LSX_HPP

#include <lsxintrin.h>

#define CV_SIMD128 1
#define CV_SIMD128_64F 1
#define CV_SIMD128_FP16 0

namespace cv
{

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

inline __m128i _v128_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
        char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
{
    return (__m128i)v16i8{ v0, v1, v2, v3, v4, v5, v6, v7,
                           v8, v9, v10, v11, v12, v13, v14, v15 };
}

inline __m128i _v128_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
        char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
{
    return (__m128i)v16i8{ v15, v14, v13, v12, v11, v10, v9, v8,
                           v7, v6, v5, v4, v3, v2, v1, v0 };
}

inline __m128i _v128_setr_h(short v0, short v1, short v2, short v3, short v4, short v5,
        short v6, short v7)
{
    return (__m128i)v8i16{ v0, v1, v2, v3, v4, v5, v6, v7 };
}

inline __m128i _v128_setr_w(int v0, int v1, int v2, int v3)
{
    return (__m128i)v4i32{ v0, v1, v2, v3 };
}

inline __m128i _v128_set_w(int v0, int v1, int v2, int v3)
{
    return (__m128i)v4i32{ v3, v2, v1, v0 };
}

inline __m128i _v128_setall_w(int v0)
{
    return __lsx_vreplgr2vr_w(v0);
}

inline __m128i _v128_setr_d(int64 v0, int64 v1)
{
    return (__m128i)v2i64{ v0, v1 };
}

inline __m128i _v128_set_d(int64 v0, int64 v1)
{
    return (__m128i)v2i64{ v1, v0 };
}

inline __m128 _v128_setr_ps(float v0, float v1, float v2, float v3)
{
    return (__m128)v4f32{ v0, v1, v2, v3 };
}

inline __m128 _v128_setall_ps(float v0)
{
    return (__m128)v4f32{ v0, v0, v0, v0 };
}

inline __m128d _v128_setr_pd(double v0, double v1)
{
    return (__m128d)v2f64{ v0, v1 };
}

inline __m128d _v128_setall_pd(double v0)
{
    return (__m128d)v2f64{ v0, v0 };
}

inline __m128i _lsx_packus_h(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_bu_h(b, a, 0);
}

inline __m128i _lsx_packs_h(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_b_h(b, a, 0);
}

inline __m128i _lsx_packus_w(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_hu_w(b, a, 0);
}

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(__m128i v): val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }

    uchar get0() const
    {
        return (uchar)__lsx_vpickve2gr_bu(val, 0);
    }

    __m128i val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(__m128i v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }

    schar get0() const
    {
        return (schar)__lsx_vpickve2gr_b(val, 0);
    }

    __m128i val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
    }

    ushort get0() const
    {
        return (ushort)__lsx_vpickve2gr_hu(val, 0);
    }

    __m128i val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
    }

    short get0() const
    {
        return (short)__lsx_vpickve2gr_h(val, 0);
    }

    __m128i val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        val = _v128_setr_w(v0, v1, v2, v3);
    }

    unsigned get0() const
    {
        return (unsigned)__lsx_vpickve2gr_wu(val, 0);
    }

    __m128i val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _v128_setr_w(v0, v1, v2, v3);
    }

    int get0() const
    {
        return (int)__lsx_vpickve2gr_w(val, 0);
    }

    __m128i val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(__m128 v) : val(v) {}
    explicit v_float32x4(__m128i v) { val = *((__m128*)&v); }
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _v128_setr_ps(v0, v1, v2, v3);
    }

    float get0() const
    {
        union { int iv; float fv; } d;
        d.iv = __lsx_vpickve2gr_w(val, 0);
        return d.fv;
    }

    int get0toint() const
    {
        __m128i result = __lsx_vftintrz_w_s(val);
        return (int)__lsx_vpickve2gr_w(result, 0);
    }

    __m128 val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(__m128i v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        val = _v128_setr_d(v0, v1);
    }

    uint64 get0() const
    {
        return __lsx_vpickve2gr_du(val, 0);
    }

    __m128i val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(__m128i v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        val = _v128_setr_d(v0, v1);
    }

    int64 get0() const
    {
        return __lsx_vpickve2gr_d(val, 0);
    }

    __m128i val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(__m128d v) : val(v) {}
    explicit v_float64x2(__m128i v) { val = *((__m128d*)&v); }
    v_float64x2(double v0, double v1)
    {
        val = _v128_setr_pd(v0, v1);
    }

    double get0() const
    {
        union { int64 iv; double fv; } d;
        d.iv = __lsx_vpickve2gr_d(val, 0);
        return d.fv;
    }

    int64 get0toint64() const
    {
        __m128i result = __lsx_vftintrz_l_d(val);
        return (int64)__lsx_vpickve2gr_d(result, 0);
    }

    __m128d val;
};

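// Illustrative sketch, not part of the original header: constructing a vector
// from scalar lanes and reading lane 0 back with get0(). The helper name
// 'lsx_example_first_lane' is hypothetical, added for exposition only.
inline int lsx_example_first_lane()
{
    v_int32x4 v(10, 20, 30, 40);   // lanes filled via _v128_setr_w
    return v.get0();               // extracts lane 0 -> 10
}
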
// Note: LSX provides a single vector store instruction, so the aligned,
// unaligned, and non-temporal variants below (and every hal::StoreMode)
// all lower to the same __lsx_vst.
#define OPENCV_HAL_IMPL_LSX_LOADSTORE(_Tpvec, _Tp) \
    inline _Tpvec v_load(const _Tp* ptr) \
    { return _Tpvec(__lsx_vld(ptr, 0)); } \
    inline _Tpvec v_load_aligned(const _Tp* ptr) \
    { return _Tpvec(__lsx_vld(ptr, 0)); } \
    inline _Tpvec v_load_low(const _Tp* ptr) \
    { return _Tpvec(__lsx_vldrepl_d(ptr, 0)); } \
    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        __m128i vl = __lsx_vldrepl_d(ptr0, 0); \
        __m128i vh = __lsx_vldrepl_d(ptr1, 0); \
        return _Tpvec(__lsx_vilvl_d(vh, vl)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(a.val, ptr, 0); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(a.val, ptr, 0); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst(a.val, ptr, 0); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if (mode == hal::STORE_UNALIGNED) \
            __lsx_vst(a.val, ptr, 0); \
        else if (mode == hal::STORE_ALIGNED_NOCACHE) \
            __lsx_vst(a.val, ptr, 0); \
        else \
            __lsx_vst(a.val, ptr, 0); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vstelm_d(a.val, ptr, 0, 0); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vstelm_d(a.val, ptr, 0, 1); }

OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint8x16, uchar)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int8x16, schar)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint16x8, ushort)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int16x8, short)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int32x4, int)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint64x2, uint64)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int64x2, int64)

#define OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \
    inline _Tpvec v_load(const _Tp* ptr) \
    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \
    inline _Tpvec v_load_aligned(const _Tp* ptr) \
    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \
    inline _Tpvec v_load_low(const _Tp* ptr) \
    { return _Tpvec((halfreg)__lsx_vldrepl_d(ptr, 0)); } \
    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    { \
        __m128i vl = __lsx_vldrepl_d(ptr0, 0); \
        __m128i vh = __lsx_vldrepl_d(ptr1, 0); \
        return _Tpvec((halfreg)__lsx_vilvl_d(vh, vl)); \
    } \
    inline void v_store(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst((__m128i)a.val, ptr, 0); } \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst((__m128i)a.val, ptr, 0); } \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vst((__m128i)a.val, ptr, 0); } \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if (mode == hal::STORE_UNALIGNED) \
            __lsx_vst((__m128i)a.val, ptr, 0); \
        else if (mode == hal::STORE_ALIGNED_NOCACHE) \
            __lsx_vst((__m128i)a.val, ptr, 0); \
        else \
            __lsx_vst((__m128i)a.val, ptr, 0); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 0); } \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 1); }

OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float32x4, float, __m128)
OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float64x2, double, __m128d)

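// Illustrative sketch, not part of the original header: a 16-byte round trip
// through v_load/v_store. 'lsx_example_copy16' is a hypothetical name.
inline void lsx_example_copy16(const uchar* src, uchar* dst)
{
    v_uint8x16 v = v_load(src);   // 16-byte load (__lsx_vld tolerates unaligned addresses)
    v_store(dst, v);              // 16-byte store (__lsx_vst)
}
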
inline __m128i _lsx_128_castps_si128(const __m128& v)
{ return __m128i(v); }

inline __m128i _lsx_128_castpd_si128(const __m128d& v)
{ return __m128i(v); }

#define OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
    { return _Tpvec(cast(a.val)); }

#define OPENCV_HAL_IMPL_LSX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
    inline _Tpvec v_setzero_##suffix() \
    { return _Tpvec(__lsx_vldi(0)); } \
    inline _Tpvec v_setall_##suffix(_Tp v) \
    { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); } \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float32x4, suffix, _lsx_128_castps_si128) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float64x2, suffix, _lsx_128_castpd_si128)

OPENCV_HAL_IMPL_LSX_INIT(v_uint8x16, uchar, u8, b, int)
OPENCV_HAL_IMPL_LSX_INIT(v_int8x16, schar, s8, b, int)
OPENCV_HAL_IMPL_LSX_INIT(v_uint16x8, ushort, u16, h, int)
OPENCV_HAL_IMPL_LSX_INIT(v_int16x8, short, s16, h, int)
OPENCV_HAL_IMPL_LSX_INIT(v_uint32x4, unsigned, u32, w, int)
OPENCV_HAL_IMPL_LSX_INIT(v_int32x4, int, s32, w, int)
OPENCV_HAL_IMPL_LSX_INIT(v_uint64x2, uint64, u64, d, long int)
OPENCV_HAL_IMPL_LSX_INIT(v_int64x2, int64, s64, d, long int)

inline __m128 _lsx_128_castsi128_ps(const __m128i &v)
{ return __m128(v); }

inline __m128d _lsx_128_castsi128_pd(const __m128i &v)
{ return __m128d(v); }

#define OPENCV_HAL_IMPL_LSX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
    inline _Tpvec v_setzero_##suffix() \
    { return _Tpvec(__lsx_vldi(0)); } \
    inline _Tpvec v_setall_##suffix(_Tp v) \
    { return _Tpvec(_v128_setall_##zsuffix(v)); } \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, cast) \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, cast)

OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float32x4, float, f32, ps, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float64x2, double, f64, pd, _lsx_128_castsi128_pd)

inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a)
{ return a; }
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a)
{ return v_float32x4(_lsx_128_castps_si128(__m128(a.val))); }

inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a)
{ return a; }
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a)
{ return v_float64x2(_lsx_128_castpd_si128(__m128d(a.val))); }

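// Illustrative sketch, not part of the original header: broadcasting a scalar
// and reinterpreting the same 128 bits under another lane type (no value
// conversion takes place). 'lsx_example_bits_of_one' is a hypothetical name.
inline v_uint32x4 lsx_example_bits_of_one()
{
    v_float32x4 ones = v_setall_f32(1.0f);   // four lanes of 1.0f
    return v_reinterpret_as_u32(ones);       // each lane reads the IEEE bits 0x3f800000
}
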
// unpacks
#define OPENCV_HAL_IMPL_LSX_UNPACK(_Tpvec, suffix) \
    inline _Tpvec v128_unpacklo(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_vilvl_##suffix(__m128i(b.val), __m128i(a.val))); } \
    inline _Tpvec v128_unpackhi(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_vilvh_##suffix(__m128i(b.val), __m128i(a.val))); }

OPENCV_HAL_IMPL_LSX_UNPACK(v_uint8x16, b)
OPENCV_HAL_IMPL_LSX_UNPACK(v_int8x16, b)
OPENCV_HAL_IMPL_LSX_UNPACK(v_uint16x8, h)
OPENCV_HAL_IMPL_LSX_UNPACK(v_int16x8, h)
OPENCV_HAL_IMPL_LSX_UNPACK(v_uint32x4, w)
OPENCV_HAL_IMPL_LSX_UNPACK(v_int32x4, w)
OPENCV_HAL_IMPL_LSX_UNPACK(v_uint64x2, d)
OPENCV_HAL_IMPL_LSX_UNPACK(v_int64x2, d)
OPENCV_HAL_IMPL_LSX_UNPACK(v_float32x4, w)
OPENCV_HAL_IMPL_LSX_UNPACK(v_float64x2, d)

// zip
#define OPENCV_HAL_IMPL_LSX_ZIP(_Tpvec) \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
    { return (_Tpvec)__lsx_vilvl_d((__m128i)b.val, (__m128i)a.val); } \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
    { return (_Tpvec)__lsx_vilvh_d((__m128i)b.val, (__m128i)a.val); } \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
                            _Tpvec& c, _Tpvec& d) \
    { \
        __m128i a1 = (__m128i)a.val, b1 = (__m128i)b.val; \
        c = _Tpvec(__lsx_vilvl_d(b1, a1)); \
        d = _Tpvec(__lsx_vilvh_d(b1, a1)); \
    } \
    inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
                      _Tpvec& ab0, _Tpvec& ab1) \
    { \
        ab0 = v128_unpacklo(a, b); \
        ab1 = v128_unpackhi(a, b); \
    }

OPENCV_HAL_IMPL_LSX_ZIP(v_uint8x16)
OPENCV_HAL_IMPL_LSX_ZIP(v_int8x16)
OPENCV_HAL_IMPL_LSX_ZIP(v_uint16x8)
OPENCV_HAL_IMPL_LSX_ZIP(v_int16x8)
OPENCV_HAL_IMPL_LSX_ZIP(v_uint32x4)
OPENCV_HAL_IMPL_LSX_ZIP(v_int32x4)
OPENCV_HAL_IMPL_LSX_ZIP(v_uint64x2)
OPENCV_HAL_IMPL_LSX_ZIP(v_int64x2)
OPENCV_HAL_IMPL_LSX_ZIP(v_float32x4)
OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)

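// Illustrative sketch, not part of the original header: interleaving two
// registers lane-by-lane with v_zip. 'lsx_example_zip8' is a hypothetical name.
inline void lsx_example_zip8(const v_int16x8& a, const v_int16x8& b,
                             v_int16x8& lo, v_int16x8& hi)
{
    // lo = {a0,b0,a1,b1,a2,b2,a3,b3}, hi = {a4,b4,a5,b5,a6,b6,a7,b7}
    v_zip(a, b, lo, hi);
}
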
#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16, __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16, __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16, __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16, __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8, __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8, __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8, __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8, __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4, __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4, __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4, __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2, __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2, __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2, __lsx_vsub_d)

OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)

// saturating multiply 8-bit, 16-bit
inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
{
    v_uint16x8 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
{
    v_int16x8 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i pev = __lsx_vmulwev_w_hu(a0, b0);
    __m128i pod = __lsx_vmulwod_w_hu(a0, b0);
    __m128i pl  = __lsx_vilvl_w(pod, pev);
    __m128i ph  = __lsx_vilvh_w(pod, pev);
    return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
}
inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i pev = __lsx_vmulwev_w_h(a0, b0);
    __m128i pod = __lsx_vmulwod_w_h(a0, b0);
    __m128i pl  = __lsx_vilvl_w(pod, pev);
    __m128i ph  = __lsx_vilvh_w(pod, pev);
    return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
}
inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
{ a = a * b; return a; }
inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
{ a = a * b; return a; }
inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
{ a = a * b; return a; }
inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
{ a = a * b; return a; }

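// Illustrative sketch, not part of the original header: 8- and 16-bit
// operator+ maps to __lsx_vsadd_*, so lanes saturate instead of wrapping
// (e.g. 250 + 10 -> 255 in a uchar lane). 'lsx_example_sat_add' is a
// hypothetical name.
inline v_uint8x16 lsx_example_sat_add(const v_uint8x16& a, const v_uint8x16& b)
{
    return a + b;   // per-lane saturating add
}
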
#define OPENCV_HAL_IMPL_LSX_BIN_FUNC(func, _Tpvec, intrin) \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint8x16, __lsx_vadd_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int8x16, __lsx_vadd_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint16x8, __lsx_vadd_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int16x8, __lsx_vadd_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint8x16, __lsx_vsub_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int8x16, __lsx_vsub_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint16x8, __lsx_vsub_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int16x8, __lsx_vsub_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_uint16x8, __lsx_vmul_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_int16x8, __lsx_vmul_h)

inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
    return v_uint8x16(__lsx_vpackev_b(p1, p0));
}

inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}

// Multiply and expand
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
    c.val = __lsx_vilvl_h(p1, p0);
    d.val = __lsx_vilvh_h(p1, p0);
}
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_b(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_b(a0, b0);
    c.val = __lsx_vilvl_h(p1, p0);
    d.val = __lsx_vilvh_h(p1, p0);
}
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_w_h(a0, b0);
    __m128i p1 = __lsx_vmulwod_w_h(a0, b0);
    c.val = __lsx_vilvl_w(p1, p0);
    d.val = __lsx_vilvh_w(p1, p0);
}
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_w_hu(a0, b0);
    __m128i p1 = __lsx_vmulwod_w_hu(a0, b0);
    c.val = __lsx_vilvl_w(p1, p0);
    d.val = __lsx_vilvh_w(p1, p0);
}
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_d_wu(a0, b0);
    __m128i p1 = __lsx_vmulwod_d_wu(a0, b0);
    c.val = __lsx_vilvl_d(p1, p0);
    d.val = __lsx_vilvh_d(p1, p0);
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(__lsx_vmuh_h(a.val, b.val)); }
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint16x8(__lsx_vmuh_hu(a.val, b.val)); }

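// Illustrative sketch, not part of the original header: v_mul_expand keeps the
// full-width products by widening 16-bit lanes into two 32-bit registers.
// 'lsx_example_widen_mul' is a hypothetical name.
inline void lsx_example_widen_mul(const v_uint16x8& a, const v_uint16x8& b,
                                  v_uint32x4& lo, v_uint32x4& hi)
{
    v_mul_expand(a, b, lo, hi);   // lo holds products of lanes 0..3, hi of lanes 4..7
}
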
#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
    { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
    { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
    { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
    { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \
    template<int imm> \
    inline _Tpuvec v_shl(const _Tpuvec& a) \
    { return _Tpuvec(__lsx_vslli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shl(const _Tpsvec& a) \
    { return _Tpsvec(__lsx_vslli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpuvec v_shr(const _Tpuvec& a) \
    { return _Tpuvec(__lsx_vsrli_##suffix(a.val, imm)); } \
    template<int imm> \
    inline _Tpsvec v_shr(const _Tpsvec& a) \
    { return _Tpsvec(__lsx_vsrai_##suffix(a.val, imm)); }

OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint16x8, v_int16x8, h, __lsx_vsra_h)
OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint32x4, v_int32x4, w, __lsx_vsra_w)
OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)

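// Illustrative sketch, not part of the original header: the immediate-form
// v_shl/v_shr templates lower to __lsx_vslli_*/__lsx_vsrli_*; the signed v_shr
// is an arithmetic shift. 'lsx_example_halve' is a hypothetical name.
inline v_uint32x4 lsx_example_halve(const v_uint32x4& a)
{
    return v_shr<1>(a);   // logical shift right by 1, i.e. a / 2 per lane
}
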
#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \
    OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix) \
    OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix) \
    OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { return _Tpvec(__lsx_vnori_b(a.val, 0)); }

OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int8x16, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint16x8, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int16x8, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint32x4, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int32x4, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)

#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { \
        __m128i c = intrin((__m128i)(a.val), (__m128i)b.val); \
        a.val = cast(c); \
        return a; \
    }

#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast) \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast) \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); }

OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float64x2, _lsx_128_castsi128_pd)

#define OPENCV_HAL_IMPL_LSX_SELECT(_Tpvec) \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_vbitsel_v(b.val, a.val, mask.val)); }

OPENCV_HAL_IMPL_LSX_SELECT(v_uint8x16)
OPENCV_HAL_IMPL_LSX_SELECT(v_int8x16)
OPENCV_HAL_IMPL_LSX_SELECT(v_uint16x8)
OPENCV_HAL_IMPL_LSX_SELECT(v_int16x8)
OPENCV_HAL_IMPL_LSX_SELECT(v_uint32x4)
OPENCV_HAL_IMPL_LSX_SELECT(v_int32x4)

inline v_float32x4 v_select(const v_float32x4 &mask, const v_float32x4 &a, const v_float32x4 &b)
{ return v_float32x4(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }
inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const v_float64x2 &b)
{ return v_float64x2(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }

#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    { return ~(a == b); } \
    inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
    { return b > a; } \
    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
    { return ~(a < b); } \
    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
    { return b >= a; }

#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \
    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
    { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \
    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
    { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \
    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
    { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \
    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
    { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \
    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \
    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)

OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint8x16, v_int8x16, b, bu)
OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu)
OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu)

#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \
    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    { return ~(a == b); }

OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)

#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); }

#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(<, vfcmp_clt, _Tpvec, ssuffix) \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix)

OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)

inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }

inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }

inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }

inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }

inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(__lsx_vfcmp_cor_s(a.val, a.val)); }

inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(__lsx_vfcmp_cor_d(a.val, a.val)); }

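// Illustrative sketch, not part of the original header: comparisons produce
// all-ones/all-zeros lane masks, which feed v_select (a per-lane bit blend).
// 'lsx_example_relu' is a hypothetical name.
inline v_float32x4 lsx_example_relu(const v_float32x4& a)
{
    v_float32x4 zero = v_setzero_f32();
    return v_select(a > zero, a, zero);   // per lane: a > 0 ? a : 0
}
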
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint8x16, __lsx_vmin_bu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint8x16, __lsx_vmax_bu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int8x16, __lsx_vmin_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int8x16, __lsx_vmax_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint16x8, __lsx_vmin_hu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint16x8, __lsx_vmax_hu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int16x8, __lsx_vmin_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int16x8, __lsx_vmax_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint32x4, __lsx_vmin_wu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint32x4, __lsx_vmax_wu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int32x4, __lsx_vmin_w)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int32x4, __lsx_vmax_w)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float32x4, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float32x4, __lsx_vfmax_s)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float64x2, __lsx_vfmin_d)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float64x2, __lsx_vfmax_d)

template <int imm,
    bool is_invalid = ((imm < 0) || (imm > 16)),
    bool is_first = (imm == 0),
    bool is_half = (imm == 8),
    bool is_second = (imm == 16),
    bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
class v_lsx_palignr_u8_class;

template <int imm>
class v_lsx_palignr_u8_class<imm, true, false, false, false, false>;

template <int imm>
class v_lsx_palignr_u8_class<imm, false, true, false, false, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        CV_UNUSED(b);
        return a;
    }
};

template <int imm>
class v_lsx_palignr_u8_class<imm, false, false, true, false, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        return __lsx_vshuf4i_d(a, b, 0x9);
    }
};

template <int imm>
class v_lsx_palignr_u8_class<imm, false, false, false, true, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        CV_UNUSED(a);
        return b;
    }
};

template <int imm>
class v_lsx_palignr_u8_class<imm, false, false, false, false, true>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        enum { imm2 = (sizeof(__m128i) - imm) };
        return __lsx_vor_v(__lsx_vbsrl_v(a, imm), __lsx_vbsll_v(b, imm2));
    }
};

template <int imm>
inline __m128i v_lsx_palignr_u8(const __m128i& a, const __m128i& b)
{
    CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_lsx_palignr_u8");
    return v_lsx_palignr_u8_class<imm>()(a, b);
}

#define OPENCV_HAL_IMPL_LSX_ROTATE_CAST(_Tpvec, cast) \
    template<int imm> \
    inline _Tpvec v_rotate_right(const _Tpvec& a) \
    { \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) }; \
        __m128i ret = __lsx_vbsrl_v((__m128i)a.val, imm2); \
        return _Tpvec(cast(ret)); \
    } \
    template<int imm> \
    inline _Tpvec v_rotate_left(const _Tpvec& a) \
    { \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) }; \
        __m128i ret = __lsx_vbsll_v((__m128i)a.val, imm2); \
        return _Tpvec(cast(ret)); \
    } \
    template<int imm> \
    inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
    { \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) }; \
        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)a.val, (__m128i)b.val))); \
    } \
    template<int imm> \
    inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
    { \
        enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) }; \
        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)b.val, (__m128i)a.val))); \
    }

OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint16x8, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int16x8, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint32x4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int32x4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint64x2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int64x2, OPENCV_HAL_NOP)

OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float32x4, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float64x2, _lsx_128_castsi128_pd)

inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
    __m128i vec = __lsx_vshuf4i_b(a.val, 0x1B);
    return v_uint8x16(__lsx_vshuf4i_w(vec, 0x1B));
}

inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
    __m128i vec = __lsx_vshuf4i_h(a.val, 0x1B);
    return v_uint16x8(__lsx_vshuf4i_w(vec, 0x4E));
}

inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{ return v_uint32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }

inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_int32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }

inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{ return v_uint64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }

inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_int64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }

inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }

// returns a[0] + a[1] + ... + a[15]
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    __m128i t1 = __lsx_vhaddw_hu_bu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline int v_reduce_sum(const v_int8x16 &a)
{
    __m128i t1 = __lsx_vhaddw_h_b(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_w_h(t1, t1);
    __m128i t3 = __lsx_vhaddw_d_w(t2, t2);
    __m128i t4 = __lsx_vhaddw_q_d(t3, t3);
    return (int)__lsx_vpickve2gr_w(t4, 0);
}

#define OPENCV_HAL_IMPL_LSX_REDUCE_16(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a) \
    { \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \
        val = intrin(val, __lsx_vbsrl_v(val, 4)); \
        val = intrin(val, __lsx_vbsrl_v(val, 2)); \
        val = intrin(val, __lsx_vbsrl_v(val, 1)); \
        return (sctype)__lsx_vpickve2gr_b(val, 0); \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, min, __lsx_vmin_bu)
OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, max, __lsx_vmax_bu)
OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, min, __lsx_vmin_b)
OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, max, __lsx_vmax_b)

#define OPENCV_HAL_IMPL_LSX_REDUCE_8(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec &a) \
    { \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \
        val = intrin(val, __lsx_vbsrl_v(val, 4)); \
        val = intrin(val, __lsx_vbsrl_v(val, 2)); \
        return (sctype)__lsx_vpickve2gr_h(val, 0); \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, min, __lsx_vmin_hu)
OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, max, __lsx_vmax_hu)
OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, min, __lsx_vmin_h)
OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, max, __lsx_vmax_h)

#define OPENCV_HAL_IMPL_LSX_REDUCE_4(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec &a) \
    { \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \
        val = intrin(val, __lsx_vbsrl_v(val, 4)); \
        return (sctype)__lsx_vpickve2gr_w(val, 0); \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, min, __lsx_vmin_wu)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, max, __lsx_vmax_wu)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, min, __lsx_vmin_w)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, max, __lsx_vmax_w)

#define OPENCV_HAL_IMPL_LSX_REDUCE_FLT(func, intrin) \
    inline float v_reduce_##func(const v_float32x4 &a) \
    { \
        __m128 val = a.val; \
        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 8)); \
        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 4)); \
        float *fval = (float*)&val; \
        return fval[0]; \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_FLT(min, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LSX_REDUCE_FLT(max, __lsx_vfmax_s)

inline int v_reduce_sum(const v_int32x4 &a)
{
    __m128i t1 = __lsx_vhaddw_d_w(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_q_d(t1, t1);
    return (int)__lsx_vpickve2gr_w(t2, 0);
}

inline unsigned v_reduce_sum(const v_uint32x4 &a)
{
    __m128i t1 = __lsx_vhaddw_du_wu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
}

inline int v_reduce_sum(const v_int16x8 &a)
{
    __m128i t1 = __lsx_vhaddw_w_h(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_d_w(t1, t1);
    __m128i t3 = __lsx_vhaddw_q_d(t2, t2);
    return (int)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sum(const v_uint16x8 &a)
{
    __m128i t1 = __lsx_vhaddw_wu_hu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
}

inline float v_reduce_sum(const v_float32x4 &a)
{
    __m128i val = (__m128i)a.val;
    val = __lsx_vbsrl_v(val, 8);
    __m128 result = __lsx_vfadd_s(a.val, (__m128)val);
    float *pa = (float*)&result;
    return (float)(pa[0] + pa[1]);
}

inline uint64 v_reduce_sum(const v_uint64x2 &a)
{
    __m128i t0 = __lsx_vhaddw_qu_du(a.val, a.val);
    return (uint64)__lsx_vpickve2gr_du(t0, 0);
}

inline int64 v_reduce_sum(const v_int64x2 &a)
{
    __m128i t0 = __lsx_vhaddw_q_d(a.val, a.val);
    return (int64)__lsx_vpickve2gr_d(t0, 0);
}

inline double v_reduce_sum(const v_float64x2 &a)
{
    double *pa = (double*)&a;
    return pa[0] + pa[1];
}

inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    __m128i a0 = (__m128i)a.val;
    __m128i b0 = (__m128i)b.val;
    __m128i c0 = (__m128i)c.val;
    __m128i d0 = (__m128i)d.val;
    __m128i ac_l = __lsx_vilvl_w(c0, a0);
    __m128i ac_h = __lsx_vilvh_w(c0, a0);
    __m128i bd_l = __lsx_vilvl_w(d0, b0);
    __m128i bd_h = __lsx_vilvh_w(d0, b0);
    __m128 ac = __lsx_vfadd_s((__m128)ac_l, (__m128)ac_h);
    __m128 bd = __lsx_vfadd_s((__m128)bd_l, (__m128)bd_h);
    return v_float32x4(__lsx_vfadd_s((__m128)__lsx_vilvl_w((__m128i)bd, (__m128i)ac),
                                     (__m128)__lsx_vilvh_w((__m128i)bd, (__m128i)ac)));
}

inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    __m128i t0 = __lsx_vabsd_b(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i t0 = __lsx_vabsd_bu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i t0 = __lsx_vabsd_hu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    __m128i t0 = __lsx_vabsd_h(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i t0 = __lsx_vabsd_wu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
}

inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    __m128i t0 = __lsx_vabsd_w(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
}

inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 a_b = a - b;
    return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff)));
}

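// Illustrative sketch, not part of the original header: v_reduce_sad computes
// the sum of absolute differences across all 16 byte lanes, a common
// block-matching kernel. 'lsx_example_l1_distance' is a hypothetical name.
inline unsigned lsx_example_l1_distance(const v_uint8x16& a, const v_uint8x16& b)
{
    return v_reduce_sad(a, b);   // sum of |a[i] - b[i]| for i = 0..15
}
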
#define OPENCV_HAL_IMPL_LSX_POPCOUNT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_popcount(const _Tp& a) \
{ return _Tpvec(__lsx_vpcnt_##suffix(a.val)); }

OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16, v_uint8x16, b)
OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16, v_int8x16, b)
OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8, v_uint16x8, h)
OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8, v_int16x8, h)
OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4, v_uint32x4, w)
OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4, v_int32x4, w)
OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2, v_uint64x2, d)
OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2, v_int64x2, d)

#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)

inline int v_signmask(const v_int8x16& a)
{
    __m128i result = __lsx_vmskltz_b(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_uint8x16& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }

inline int v_signmask(const v_int16x8 &a)
{
    __m128i result = __lsx_vmskltz_h(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_uint16x8 &a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    __m128i result = __lsx_vmskltz_w(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }

inline int v_signmask(const v_uint64x2& a)
{
    __m128i result = __lsx_vmskltz_d(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }

inline int v_signmask(const v_float32x4& a)
{ return v_signmask(*(v_int32x4*)(&a)); }

inline int v_signmask(const v_float64x2& a)
{ return v_signmask(*(v_int64x2*)(&a)); }

inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }

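// Illustrative sketch, not part of the original header: v_signmask packs the
// per-lane sign bits into an int, and v_scan_forward turns that into the index
// of the first matching lane. 'lsx_example_first_negative' is a hypothetical
// name; the result is only meaningful when at least one lane is negative.
inline int lsx_example_first_negative(const v_int32x4& a)
{
    return v_scan_forward(a);   // index of the first lane with its sign bit set
}
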
#define OPENCV_HAL_IMPL_LSX_CHECK(_Tpvec, allmask) \
    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
OPENCV_HAL_IMPL_LSX_CHECK(v_uint8x16, 65535)
OPENCV_HAL_IMPL_LSX_CHECK(v_int8x16, 65535)
OPENCV_HAL_IMPL_LSX_CHECK(v_uint16x8, 255)
OPENCV_HAL_IMPL_LSX_CHECK(v_int16x8, 255)
OPENCV_HAL_IMPL_LSX_CHECK(v_uint32x4, 15)
OPENCV_HAL_IMPL_LSX_CHECK(v_int32x4, 15)
OPENCV_HAL_IMPL_LSX_CHECK(v_uint64x2, 3)
OPENCV_HAL_IMPL_LSX_CHECK(v_int64x2, 3)
OPENCV_HAL_IMPL_LSX_CHECK(v_float32x4, 15)
OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)

#define OPENCV_HAL_IMPL_LSX_MULADD(_Tpvec, suffix) \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \
    inline _Tpvec v_sqrt(const _Tpvec& x) \
    { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_fma(a, a, b * b); } \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
    { return v_sqrt(v_fma(a, a, b * b)); }

OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_int32x4(__lsx_vmadd_w(c.val, a.val, b.val)); }

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_fma(a, b, c); }

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{ return v_float32x4(__lsx_vfrsqrt_s(x.val)); }

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{ return v_float64x2(__lsx_vfrsqrt_d(x.val)); }

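// Illustrative sketch, not part of the original header: v_fma lowers to the
// fused __lsx_vfmadd_s, computing a*x + y with a single rounding step.
// 'lsx_example_axpy' is a hypothetical name.
inline v_float32x4 lsx_example_axpy(const v_float32x4& a, const v_float32x4& x,
                                    const v_float32x4& y)
{
    return v_fma(a, x, y);   // per lane: a*x + y
}
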
#define OPENCV_HAL_IMPL_LSX_ABS(_Tpvec, suffix) \
    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
    { return v_u##_Tpvec(__lsx_vabsd_##suffix(x.val, __lsx_vldi(0))); }

OPENCV_HAL_IMPL_LSX_ABS(int8x16, b)
OPENCV_HAL_IMPL_LSX_ABS(int16x8, h)
OPENCV_HAL_IMPL_LSX_ABS(int32x4, w)

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(*((__m128i*)&x) & __lsx_vreplgr2vr_w(0x7fffffff)); }
inline v_float64x2 v_abs(const v_float64x2& x)
{ return v_float64x2(*((__m128i*)&x) & __lsx_vreplgr2vr_d(0x7fffffffffffffff)); }

inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return (v_uint8x16)__lsx_vabsd_bu(a.val, b.val); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return (v_uint16x8)__lsx_vabsd_hu(a.val, b.val); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return (v_uint32x4)__lsx_vabsd_wu(a.val, b.val); }

inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{ return (v_uint8x16)__lsx_vabsd_b(a.val, b.val); }
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return (v_uint16x8)__lsx_vabsd_h(a.val, b.val); }
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(a - b); }

inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(a - b); }

inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{
    v_int8x16 d = a - b;
    v_int8x16 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }

inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(__lsx_vftint_w_s(a.val)); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(__lsx_vftint_w_d(a.val, a.val)); }

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{ return v_int32x4(__lsx_vftint_w_d(b.val, a.val)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(a.val)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(__lsx_vftintrz_w_d(a.val, a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrm_s(a.val)))); }

inline v_int32x4 v_floor(const v_float64x2& a)
{ return v_trunc(v_float64x2(__lsx_vfrintrm_d(a.val))); }

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrp_s(a.val)))); }

inline v_int32x4 v_ceil(const v_float64x2& a)
{ return v_trunc(v_float64x2(__lsx_vfrintrp_d(a.val))); }

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{ return v_float32x4(__lsx_vffint_s_w(a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(__lsx_vfcvt_s_d(a.val, a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(__lsx_vfcvt_s_d(b.val, a.val)); }

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(__lsx_vffintl_d_w(a.val)); }

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{ return v_float64x2(__lsx_vffinth_d_w(a.val)); }

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{ return v_float64x2(__lsx_vfcvtl_d_s(a.val)); }

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(__lsx_vfcvth_d_s(a.val)); }

inline v_float64x2 v_cvt_f64(const v_int64x2& v)
{ return v_float64x2(__lsx_vffint_d_l(v.val)); }

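// Illustrative sketch, not part of the original header: the two-argument
// v_round packs four doubles (two per source register) into one v_int32x4,
// with the first argument filling the low lanes. 'lsx_example_round4' is a
// hypothetical name.
inline v_int32x4 lsx_example_round4(const v_float64x2& lo, const v_float64x2& hi)
{
    return v_round(lo, hi);   // lanes 0..1 from lo, lanes 2..3 from hi
}
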
1424inline v_int8x16 v_lut(const schar* tab, const int* idx)
1425{
1426 return v_int8x16(_v128_setr_b(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1427 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]], tab[idx[8]],
1428 tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]],
1429 tab[idx[14]], tab[idx[15]]));
1430}
1431
1432inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1433{
1434 return v_int8x16(_v128_setr_h(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]),
1435 *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]), *(const short*)(tab + idx[4]),
1436 *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
1437}
1438
1439inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1440{
1441 return v_int8x16(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1442 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
1443}
1444
1445inline v_uint8x16 v_lut(const uchar* tab, const int* idx)
1446{ return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
1447inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx)
1448{ return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
1449inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx)
1450{ return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
1451
1452inline v_int16x8 v_lut(const short* tab, const int* idx)
1453{
1454 return v_int16x8(_v128_setr_h(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1455 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1456}
1457inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1458{
1459 return v_int16x8(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
1460 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
1461}
1462inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1463{
1464 return v_int16x8(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
1465}
1466
1467inline v_uint16x8 v_lut(const ushort* tab, const int* idx)
1468{ return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
1469inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx)
1470{ return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
1471inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx)
1472{ return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
1473
1474inline v_int32x4 v_lut(const int* tab, const int* idx)
1475{
1476 return v_int32x4(_v128_setr_w(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
1477}
1478inline v_int32x4 v_lut_pairs(const int *tab, const int* idx)
1479{
1480 return v_int32x4(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
1481}
1482inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1483{
1484 return v_int32x4(__lsx_vld(tab + idx[0], 0));
1485}
1486
1487inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
1488inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
1489inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
1490
1491inline v_int64x2 v_lut(const int64_t* tab, const int *idx)
1492{
1493 return v_int64x2(_v128_setr_d(tab[idx[0]], tab[idx[1]]));
1494}
1495inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1496{
1497 return v_int64x2(__lsx_vld(tab + idx[0], 0));
1498}
1499
1500inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1501inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1502
1503inline v_float32x4 v_lut(const float* tab, const int* idx)
1504{
1505 return v_float32x4(_v128_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
1506}
1507inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1508{
1509 return v_float32x4((__m128)_v128_setr_pd(*(const double*)(tab + idx[0]), *(const double*)(tab + idx[1])));
1510}
1511inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1512{
1513 return v_float32x4((__m128)__lsx_vld(tab + idx[0], 0));
1514}
1515
1516inline v_float64x2 v_lut(const double* tab, const int* idx)
1517{
1518 return v_float64x2(_v128_setr_pd(tab[idx[0]], tab[idx[1]]));
1519}
1520inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1521{
1522 return v_float64x2((__m128d)__lsx_vld(tab + idx[0], 0));
1523}
1524
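// Editor's note: a minimal usage sketch of the v_lut family (illustrative,
// not part of the original header). v_lut gathers one element per index,
// v_lut_pairs two consecutive elements per index, v_lut_quads four:
//   int tab[8]  = {10, 11, 12, 13, 14, 15, 16, 17};
//   int idx4[4] = {6, 4, 2, 0};
//   v_int32x4 r = v_lut(tab, idx4);        // lanes: 16, 14, 12, 10
//   int idx2[2] = {0, 4};
//   v_int32x4 p = v_lut_pairs(tab, idx2);  // lanes: 10, 11, 14, 15
//   int idx1[1] = {2};
//   v_int32x4 q = v_lut_quads(tab, idx1);  // lanes: 12, 13, 14, 15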
1525inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1526{
1527    const int *idx = (const int*)&idxvec.val;
1528 return v_lut(tab, idx);
1529}
1530
1531inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1532{
1533 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1534}
1535
1536inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1537{
1538 const int *idx = (const int*)&idxvec.val;
1539 return v_lut(tab, idx);
1540}
1541
1542inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1543{
1544 const int *idx = (const int*)&idxvec.val;
1545 return v_lut(tab, idx);
1546}
1547
1548inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1549{
1550 const int *idx = (const int*)&idxvec.val;
1551 __m128i xy0 = __lsx_vld(tab + idx[0], 0);
1552 __m128i xy1 = __lsx_vld(tab + idx[1], 0);
1553 __m128i xy2 = __lsx_vld(tab + idx[2], 0);
1554 __m128i xy3 = __lsx_vld(tab + idx[3], 0);
1555 __m128i xy01 = __lsx_vilvl_d(xy1, xy0);
1556 __m128i xy23 = __lsx_vilvl_d(xy3, xy2);
1557 __m128i xxyy02 = __lsx_vilvl_w(xy23, xy01);
1558 __m128i xxyy13 = __lsx_vilvh_w(xy23, xy01);
1559 x = v_float32x4((__m128)__lsx_vilvl_w(xxyy13, xxyy02));
1560 y = v_float32x4((__m128)__lsx_vilvh_w(xxyy13, xxyy02));
1561}
1562
1563inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1564{
1565 const int* idx = (const int*)&idxvec.val;
1566 __m128i xy0 = __lsx_vld(tab + idx[0], 0);
1567 __m128i xy1 = __lsx_vld(tab + idx[1], 0);
1568 x = v_float64x2((__m128d)__lsx_vilvl_d(xy1, xy0));
1569 y = v_float64x2((__m128d)__lsx_vilvh_d(xy1, xy0));
1570}
1571
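// Editor's note (illustrative sketch, not part of the original header):
// v_lut_deinterleave gathers (x, y) pairs at the given element offsets and
// splits them into separate vectors:
//   float pts[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};  // x,y interleaved
//   v_int32x4 idx(0, 2, 4, 6);
//   v_float32x4 X, Y;
//   v_lut_deinterleave(pts, idx, X, Y);  // X = {1, 3, 5, 7}, Y = {2, 4, 6, 8}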
1572inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
1573{
1574 return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
1575 _v128_setr_d(0x0705060403010200, 0x0f0d0e0c0b090a08)));
1576}
1577inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
1578{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1579inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
1580{
1581 return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
1582 _v128_setr_d(0x0703060205010400, 0x0f0b0e0a0d090c08)));
1583}
1584inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
1585{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1586
1587inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
1588{
1589 return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
1590 _v128_setr_d(0x0706030205040100, 0x0f0e0b0a0d0c0908)));
1591}
1592inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
1593{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1594inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
1595{
1596 return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
1597 _v128_setr_d(0x0b0a030209080100, 0x0f0e07060d0c0504)));
1598}
1599inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
1600{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1601
1602inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
1603{
1604 return v_int32x4(__lsx_vshuf4i_w(vec.val, 0xd8));
1605}
1606inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
1607{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1608
1609inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
1610{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1611
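// Editor's note (illustrative, not part of the original header): the shuffle
// constants above encode OpenCV's lane-reordering contract. Per group of four
// lanes, v_interleave_pairs maps (a0 a1 a2 a3) -> (a0 a2 a1 a3); per group of
// eight, v_interleave_quads maps (a0 .. a7) -> (a0 a4 a1 a5 a2 a6 a3 a7):
//   v_int32x4 v(0, 1, 2, 3);
//   v_int32x4 r = v_interleave_pairs(v);   // r = {0, 2, 1, 3}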
1612inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
1613{
1614 __m128i zero = __lsx_vldi(0);
1615 return v_int8x16(__lsx_vshuf_b(zero, vec.val,
1616 _v128_set_d(0x1211100f0e0d0c0a, 0x0908060504020100)));
1617}
1618inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
1619{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1620
1621inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
1622{
1623 __m128i zero = __lsx_vldi(0);
1624 return v_int16x8(__lsx_vshuf_b(zero, vec.val,
1625 _v128_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100)));
1626}
1627inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
1628{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1629
1630inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
1631inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
1632inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
1633
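// Editor's note (illustrative, not part of the original header): for 8/16-bit
// lanes, v_pack_triplets drops every fourth lane, compacting triplets that
// were padded to groups of four:
//   (r0 g0 b0 _  r1 g1 b1 _  r2 g2 b2 _  r3 g3 b3 _)
//    -> (r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3, last 4 lanes unspecified)
// For 32-bit and float lanes one vector already holds a single padded
// triplet, hence the pass-through overloads above.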
1634////////// Matrix operations /////////
1635
1636//////// Dot Product ////////
1637
1638// 16 >> 32
1639inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
1640{
1641 __m128i x = a.val, y = b.val;
1642 return v_int32x4(__lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y));
1643}
1644inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1645{
1646 __m128i x = a.val, y = b.val, z = c.val;
1647 __m128i t = __lsx_vmaddwev_w_h(z, x, y);
1648 return v_int32x4(__lsx_vmaddwod_w_h(t, x, y));
1649}
1650
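// Editor's note (illustrative, not part of the original header): v_dotprod
// multiplies adjacent lane pairs and widens, r[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1];
// the even/odd widening multiply-add intrinsics above implement exactly that:
//   v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
//   v_int16x8 b(1, 1, 1, 1, 1, 1, 1, 1);
//   v_int32x4 r = v_dotprod(a, b);   // r = {3, 7, 11, 15}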
1651// 32 >> 64
1652inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
1653{
1654 __m128i x = a.val, y = b.val;
1655 return v_int64x2(__lsx_vmaddwod_d_w(__lsx_vmulwev_d_w(x, y), x, y));
1656}
1657inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1658{
1659 __m128i x = a.val, y = b.val, z = c.val;
1660 __m128i t = __lsx_vmaddwev_d_w(z, x, y);
1661 return v_int64x2(__lsx_vmaddwod_d_w(t, x, y));
1662}
1663
1664// 8 >> 32
1665inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
1666{
1667 __m128i x = a.val, y = b.val;
1668 __m128i even = __lsx_vmulwev_h_bu(x, y);
1669 __m128i odd = __lsx_vmulwod_h_bu(x, y);
1670 __m128i prod0 = __lsx_vhaddw_wu_hu(even, even);
1671 __m128i prod1 = __lsx_vhaddw_wu_hu(odd, odd);
1672 return v_uint32x4(__lsx_vadd_w(prod0, prod1));
1673}
1674
1675inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1676{ return v_dotprod_expand(a, b) + c ;}
1677
1678inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
1679{
1680 __m128i x = a.val, y = b.val;
1681 __m128i even = __lsx_vmulwev_h_b(x, y);
1682 __m128i odd = __lsx_vmulwod_h_b(x, y);
1683 __m128i prod0 = __lsx_vhaddw_w_h(even, even);
1684 __m128i prod1 = __lsx_vhaddw_w_h(odd, odd);
1685 return v_int32x4(__lsx_vadd_w(prod0, prod1));
1686}
1687inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1688{ return v_dotprod_expand(a, b) + c; }
1689
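// Editor's note (illustrative, not part of the original header):
// v_dotprod_expand widens by a factor of four, summing lane quads:
// r[i] = a[4i]*b[4i] + a[4i+1]*b[4i+1] + a[4i+2]*b[4i+2] + a[4i+3]*b[4i+3].
// The horizontal adds above produce (a0*b0 + a2*b2) + (a1*b1 + a3*b3) per lane:
//   v_uint8x16 a = v_setall_u8(1), b = v_setall_u8(2);
//   v_uint32x4 r = v_dotprod_expand(a, b);   // every lane: 4 * (1*2) = 8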
1690// 16 >> 64
1691inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
1692{
1693 __m128i x = a.val, y = b.val;
1694 __m128i even = __lsx_vmulwev_w_hu(x, y);
1695 __m128i odd = __lsx_vmulwod_w_hu(x, y);
1696 __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
1697 __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
1698 return v_uint64x2(__lsx_vadd_d(prod0, prod1));
1699}
1700inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1701{ return v_dotprod_expand(a, b) + c; }
1702
1703inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
1704{
1705 __m128i x = a.val, y = b.val;
1706 __m128i even = __lsx_vmulwev_w_h(x, y);
1707 __m128i odd = __lsx_vmulwod_w_h(x, y);
1708 __m128i prod0 = __lsx_vhaddw_d_w(even, even);
1709 __m128i prod1 = __lsx_vhaddw_d_w(odd, odd);
1710 return v_int64x2(__lsx_vadd_d(prod0, prod1));
1711}
1712inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1713{ return v_dotprod_expand(a, b) + c; }
1714
1715//32 >> 64f
1716inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
1717{ return v_cvt_f64(v_dotprod(a, b)); }
1718inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1719{ return v_dotprod_expand(a, b) + c; }
1720
1721
1722//////// Fast Dot Product ////////
1723
1724// 16 >> 32
1725inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
1726{ return v_dotprod(a, b); }
1727inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1728{ return v_dotprod(a, b, c); }
1729
1730// 32 >> 64
1731inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
1732{ return v_dotprod(a, b); }
1733inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1734{ return v_dotprod(a, b, c); }
1735
1736// 8 >> 32
1737inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
1738{ return v_dotprod_expand(a, b); }
1739inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1740{ return v_dotprod_expand(a, b, c); }
1741
1742inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
1743{ return v_dotprod_expand(a, b); }
1744inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1745{ return v_dotprod_expand(a, b, c); }
1746
1747// 16 >> 64
1748inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1749{
1750 __m128i x = a.val, y = b.val;
1751 __m128i even = __lsx_vmulwev_w_hu(x, y);
1752 __m128i odd = __lsx_vmulwod_w_hu(x, y);
1753 __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
1754 __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
1755 return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
1756}
1757inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1758{ return v_dotprod_expand_fast(a, b) + c; }
1759
1760inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1761{
1762 __m128i x = a.val, y = b.val;
1763 __m128i prod = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y);
1764 __m128i sign = __lsx_vsrai_w(prod, 31);
1765 __m128i lo = __lsx_vilvl_w(sign, prod);
1766 __m128i hi = __lsx_vilvh_w(sign, prod);
1767 return v_int64x2(__lsx_vadd_d(lo, hi));
1768}
1769inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1770{ return v_dotprod_expand_fast(a, b) + c; }
1771
1772// 32 >> 64f
1773inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1774{ return v_dotprod_expand(a, b); }
1775inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1776{ return v_dotprod_expand(a, b, c); }
1777
1778inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
1779 const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3)
1780{
1781 __m128i x = (__m128i)v.val;
1782 __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
1783 __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
1784 __m128 v2 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val);
1785 __m128 v3 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xFF), m3.val);
1786
1787 return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), __lsx_vfadd_s(v2, v3)));
1788}
1789
1790inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
1791 const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a)
1792{
1793 __m128i x = (__m128i)v.val;
1794 __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
1795 __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
1796 __m128 v2 = __lsx_vfmadd_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val, a.val);
1797
1798 return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), v2));
1799}
1800
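// Editor's note (illustrative, not part of the original header): v_matmul
// broadcasts each lane of v and computes
//   r = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3,
// while v_matmuladd replaces the last product with an additive vector:
//   r = v[0]*m0 + v[1]*m1 + v[2]*m2 + a.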
1801#define OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(_Tpvec, cast_from, cast_to) \
1802 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1803 const _Tpvec& a2, const _Tpvec& a3, \
1804 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1805 { \
1806 __m128i t0 = cast_from(__lsx_vilvl_w(a1.val, a0.val)); \
1807 __m128i t1 = cast_from(__lsx_vilvl_w(a3.val, a2.val)); \
1808 __m128i t2 = cast_from(__lsx_vilvh_w(a1.val, a0.val)); \
1809 __m128i t3 = cast_from(__lsx_vilvh_w(a3.val, a2.val)); \
1810 b0.val = cast_to(__lsx_vilvl_d(t1, t0)); \
1811 b1.val = cast_to(__lsx_vilvh_d(t1, t0)); \
1812 b2.val = cast_to(__lsx_vilvl_d(t3, t2)); \
1813 b3.val = cast_to(__lsx_vilvh_d(t3, t2)); \
1814 }
1815
1816OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_uint32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1817OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_int32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1818
1819inline void v_transpose4x4(const v_float32x4& a0, const v_float32x4& a1,
1820 const v_float32x4& a2, const v_float32x4& a3,
1821                           v_float32x4& b0, v_float32x4& b1, v_float32x4& b2, v_float32x4& b3)
1822{
1823 __m128i vec0 = (__m128i)a0.val, vec1 = (__m128i)a1.val;
1824 __m128i vec2 = (__m128i)a2.val, vec3 = (__m128i)a3.val;
1825 __m128i t0 = __lsx_vilvl_w(vec1, vec0);
1826 __m128i t1 = __lsx_vilvl_w(vec3, vec2);
1827 __m128i t2 = __lsx_vilvh_w(vec1, vec0);
1828 __m128i t3 = __lsx_vilvh_w(vec3, vec2);
1829 b0.val = __m128(__lsx_vilvl_d(t1, t0));
1830 b1.val = __m128(__lsx_vilvh_d(t1, t0));
1831 b2.val = __m128(__lsx_vilvl_d(t3, t2));
1832 b3.val = __m128(__lsx_vilvh_d(t3, t2));
1833}
1834
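// Editor's note (illustrative, not part of the original header): the
// interleave pattern above is the classic 4x4 transpose:
//   v_int32x4 a0(0, 1, 2, 3), a1(4, 5, 6, 7), a2(8, 9, 10, 11), a3(12, 13, 14, 15);
//   v_int32x4 b0, b1, b2, b3;
//   v_transpose4x4(a0, a1, a2, a3, b0, b1, b2, b3);
//   // b0 = {0, 4, 8, 12}, b1 = {1, 5, 9, 13}, b2 = {2, 6, 10, 14}, b3 = {3, 7, 11, 15}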
1835//////////////// Value reordering /////////////
1836
1837/* Expand */
1838#define OPENCV_HAL_IMPL_LSX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin_lo, intrin_hi) \
1839 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1840 { \
1841 b0.val = intrin_lo(a.val, 0); \
1842 b1.val = intrin_hi(a.val); \
1843 } \
1844 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1845 { return _Tpwvec(intrin_lo(a.val, 0)); } \
1846 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1847 { return _Tpwvec(intrin_hi(a.val)); } \
1848 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1849 { \
1850 __m128i a = __lsx_vld(ptr, 0); \
1851 return _Tpwvec(intrin_lo(a, 0)); \
1852 }
1853
1854OPENCV_HAL_IMPL_LSX_EXPAND(v_uint8x16, v_uint16x8, uchar, __lsx_vsllwil_hu_bu, __lsx_vexth_hu_bu)
1855OPENCV_HAL_IMPL_LSX_EXPAND(v_int8x16, v_int16x8, schar, __lsx_vsllwil_h_b, __lsx_vexth_h_b)
1856OPENCV_HAL_IMPL_LSX_EXPAND(v_uint16x8, v_uint32x4, ushort, __lsx_vsllwil_wu_hu, __lsx_vexth_wu_hu)
1857OPENCV_HAL_IMPL_LSX_EXPAND(v_int16x8, v_int32x4, short, __lsx_vsllwil_w_h, __lsx_vexth_w_h)
1858OPENCV_HAL_IMPL_LSX_EXPAND(v_uint32x4, v_uint64x2, unsigned, __lsx_vsllwil_du_wu, __lsx_vexth_du_wu)
1859OPENCV_HAL_IMPL_LSX_EXPAND(v_int32x4, v_int64x2, int, __lsx_vsllwil_d_w, __lsx_vexth_d_w)
1860
1861#define OPENCV_HAL_IMPL_LSX_EXPAND_Q(_Tpvec, _Tp, intrin_lo, intrin_hi) \
1862 inline _Tpvec v_load_expand_q(const _Tp* ptr) \
1863 { \
1864 __m128i a = __lsx_vld(ptr, 0); \
1865 __m128i b = intrin_lo(a, 0); \
1866 return _Tpvec(intrin_hi(b, 0)); \
1867 }
1868
1869OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_uint32x4, uchar, __lsx_vsllwil_hu_bu, __lsx_vsllwil_wu_hu)
1870OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_int32x4, schar, __lsx_vsllwil_h_b, __lsx_vsllwil_w_h)
1871
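// Editor's note (illustrative, not part of the original header): the expand
// helpers widen lanes to the next integer size; v_load_expand reads a full
// 16-byte vector but widens only its low half:
//   uchar buf[16] = {0, 1, 2, 3, 4, 5, 6, 7};   // upper bytes ignored
//   v_uint16x8 w = v_load_expand(buf);          // w = {0, 1, 2, 3, 4, 5, 6, 7}
// v_expand splits one vector into widened low and high halves in one call.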
1872/* pack */
1873// 16
1874inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
1875{ return v_int8x16(_lsx_packs_h(a.val, b.val)); }
1876
1877inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
1878{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, 0)); }
1879
1880inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
1881{ return v_uint8x16(_lsx_packus_h(a.val, b.val)); }
1882
1883inline void v_pack_store(schar* ptr, const v_int16x8& a)
1884{ v_store_low(ptr, v_pack(a, a)); }
1885
1886inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
1887{ v_store_low(ptr, v_pack(a, a)); }
1888
1889inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
1890{ v_store_low(ptr, v_pack_u(a, a)); }
1891
1892template<int n> inline
1893v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
1894{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, n)); }
1895
1896template<int n> inline
1897void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
1898{ __lsx_vstelm_d(__lsx_vssrlrni_bu_h(a.val, a.val, n), ptr, 0, 0); }
1899
1900template<int n> inline
1901v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
1902{ return v_uint8x16(__lsx_vssrarni_bu_h(b.val, a.val, n)); }
1903
1904template<int n> inline
1905void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
1906{ __lsx_vstelm_d(__lsx_vssrarni_bu_h(a.val, a.val, n), ptr, 0, 0); }
1907
1908template<int n> inline
1909v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
1910{ return v_int8x16(__lsx_vssrarni_b_h(b.val, a.val, n)); }
1911
1912template<int n> inline
1913void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
1914{ __lsx_vstelm_d(__lsx_vssrarni_b_h(a.val, a.val, n), ptr, 0, 0); }
1915
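// Editor's note (illustrative, not part of the original header): v_pack
// saturates to the narrower type, and v_rshr_pack<n> first applies a rounding
// right shift, lane = saturate((x + (1 << (n-1))) >> n):
//   v_int16x8 a = v_setall_s16(300);
//   v_int8x16 p = v_pack(a, a);          // every lane: 127 (saturated)
//   v_int8x16 q = v_rshr_pack<2>(a, a);  // every lane: 75, i.e. (300 + 2) >> 2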
1916//32
1917inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
1918{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, 0)); }
1919
1920inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
1921{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, 0)); }
1922
1923inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
1924{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, 0)); }
1925
1926inline void v_pack_store(short* ptr, const v_int32x4& a)
1927{ v_store_low(ptr, v_pack(a, a)); }
1928
1929inline void v_pack_store(ushort *ptr, const v_uint32x4& a)
1930{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
1931
1932inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
1933{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
1934
1935template<int n> inline
1936v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
1937{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, n)); }
1938
1939template<int n> inline
1940void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
1941{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, n), ptr, 0, 0); }
1942
1943template<int n> inline
1944v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
1945{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, n)); }
1946
1947template<int n> inline
1948void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
1949{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, n), ptr, 0, 0); }
1950
1951template<int n> inline
1952v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
1953{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, n)); }
1954
1955template<int n> inline
1956void v_rshr_pack_store(short* ptr, const v_int32x4& a)
1957{ __lsx_vstelm_d(__lsx_vssrarni_h_w(a.val, a.val, n), ptr, 0, 0); }
1958
1959// 64
1960// Non-saturating pack
1961inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
1962{ return v_uint32x4(__lsx_vpickev_w(b.val, a.val)); }
1963
1964inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
1965{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
1966
1967inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
1968{ __lsx_vstelm_d(__lsx_vshuf4i_w(a.val, 0x08), ptr, 0, 0); }
1969
1970inline void v_pack_store(int *ptr, const v_int64x2& a)
1971{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(a)); }
1972
1973template<int n> inline
1974v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
1975{ return v_uint32x4(__lsx_vsrlrni_w_d(b.val, a.val, n)); }
1976
1977template<int n> inline
1978void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
1979{ __lsx_vstelm_d(__lsx_vsrlrni_w_d(a.val, a.val, n), ptr, 0, 0); }
1980
1981template<int n> inline
1982v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
1983{ return v_int32x4(__lsx_vsrarni_w_d(b.val, a.val, n)); }
1984
1985template<int n> inline
1986void v_rshr_pack_store(int* ptr, const v_int64x2& a)
1987{ __lsx_vstelm_d(__lsx_vsrarni_w_d(a.val, a.val, n), ptr, 0, 0); }
1988
1989// pack boolean
1990inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1991{ return v_uint8x16(__lsx_vssrarni_b_h(b.val, a.val, 0)); }
1992
1993inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1994 const v_uint32x4& c, const v_uint32x4& d)
1995{
1996 __m128i ab = __lsx_vssrarni_h_w(b.val, a.val, 0);
1997 __m128i cd = __lsx_vssrarni_h_w(d.val, c.val, 0);
1998 return v_uint8x16(__lsx_vssrarni_b_h(cd, ab, 0));
1999}
2000
2001inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
2002 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
2003 const v_uint64x2& g, const v_uint64x2& h)
2004{
2005 __m128i ab = __lsx_vssrarni_w_d(b.val, a.val, 0);
2006 __m128i cd = __lsx_vssrarni_w_d(d.val, c.val, 0);
2007 __m128i ef = __lsx_vssrarni_w_d(f.val, e.val, 0);
2008 __m128i gh = __lsx_vssrarni_w_d(h.val, g.val, 0);
2009
2010 __m128i abcd = __lsx_vssrarni_h_w(cd, ab, 0);
2011 __m128i efgh = __lsx_vssrarni_h_w(gh, ef, 0);
2012 return v_uint8x16(__lsx_vssrarni_b_h(efgh, abcd, 0));
2013}
2014
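// Editor's note (illustrative, not part of the original header): v_pack_b
// narrows boolean/mask vectors whose lanes are 0 or all-ones; the saturating
// pack keeps that encoding intact:
//   v_uint16x8 m = v_setall_u16(0xffff);   // e.g. a comparison result
//   v_uint8x16 p = v_pack_b(m, m);         // every lane: 0xff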
2015/* Recombine */
2016// it's implemented above, together with the load and store operations
2017
2018/* Extract */
2019#define OPENCV_HAL_IMPL_LSX_EXTRACT(_Tpvec) \
2020 template<int s> \
2021 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2022 { return v_rotate_right<s>(a, b); }
2023
2024OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint8x16)
2025OPENCV_HAL_IMPL_LSX_EXTRACT(v_int8x16)
2026OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint16x8)
2027OPENCV_HAL_IMPL_LSX_EXTRACT(v_int16x8)
2028OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint32x4)
2029OPENCV_HAL_IMPL_LSX_EXTRACT(v_int32x4)
2030OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint64x2)
2031OPENCV_HAL_IMPL_LSX_EXTRACT(v_int64x2)
2032OPENCV_HAL_IMPL_LSX_EXTRACT(v_float32x4)
2033OPENCV_HAL_IMPL_LSX_EXTRACT(v_float64x2)
2034
2035#define OPENCV_HAL_IMPL_LSX_EXTRACT_N(_Tpvec, _Twvec, intrin) \
2036template<int i> \
2037inline _Twvec v_extract_n(const _Tpvec& a) \
2038{ return (_Twvec)intrin(a.val, i); }
2039
2040OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint8x16, uchar, __lsx_vpickve2gr_b)
2041OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int8x16, schar, __lsx_vpickve2gr_b)
2042OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint16x8, ushort, __lsx_vpickve2gr_h)
2043OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int16x8, short, __lsx_vpickve2gr_h)
2044OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint32x4, uint, __lsx_vpickve2gr_w)
2045OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int32x4, int, __lsx_vpickve2gr_w)
2046OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint64x2, uint64, __lsx_vpickve2gr_d)
2047OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int64x2, int64, __lsx_vpickve2gr_d)
2048
2049template<int i>
2050inline float v_extract_n(const v_float32x4& v)
2051{
2052 union { uint iv; float fv; } d;
2053 d.iv = __lsx_vpickve2gr_w(v.val, i);
2054 return d.fv;
2055}
2056
2057template<int i>
2058inline double v_extract_n(const v_float64x2& v)
2059{
2060 union { uint64 iv; double dv; } d;
2061 d.iv = __lsx_vpickve2gr_d(v.val, i);
2062 return d.dv;
2063}
2064
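// Editor's note (illustrative, not part of the original header): v_extract_n
// reads one lane as a scalar, with the lane index fixed at compile time:
//   v_int32x4 v(10, 20, 30, 40);
//   int x = v_extract_n<2>(v);   // x = 30
// The float/double overloads go through a union because the pickve2gr
// intrinsics return integer register contents.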
2065template<int i>
2066inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
2067{ return v_uint32x4(__lsx_vreplvei_w(a.val, i)); }
2068
2069template<int i>
2070inline v_int32x4 v_broadcast_element(const v_int32x4& a)
2071{ return v_int32x4(__lsx_vreplvei_w(a.val, i)); }
2072
2073template<int i>
2074inline v_float32x4 v_broadcast_element(const v_float32x4& a)
2075{ return v_float32x4((__m128)__lsx_vreplvei_w((__m128i)a.val, i)); }
2076
2077///////////////////// load deinterleave /////////////////////////////
2078
2079inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
2080{
2081 __m128i t0 = __lsx_vld(ptr, 0);
2082 __m128i t1 = __lsx_vld(ptr, 16);
2083
2084 a.val = __lsx_vpickev_b(t1, t0);
2085 b.val = __lsx_vpickod_b(t1, t0);
2086}
2087
2088inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2089{
2090 __m128i t0 = __lsx_vld(ptr, 0);
2091 __m128i t1 = __lsx_vld(ptr, 16);
2092 a.val = __lsx_vpickev_h(t1, t0);
2093 b.val = __lsx_vpickod_h(t1, t0);
2094}
2095
2096inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2097{
2098 __m128i t0 = __lsx_vld(ptr, 0);
2099 __m128i t1 = __lsx_vld(ptr, 16);
2100 a.val = __lsx_vpickev_w(t1, t0);
2101 b.val = __lsx_vpickod_w(t1, t0);
2102}
2103
2104inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b)
2105{
2106 __m128i t0 = __lsx_vld(ptr, 0);
2107 __m128i t1 = __lsx_vld(ptr, 16);
2108 a.val = __lsx_vilvl_d(t1, t0);
2109 b.val = __lsx_vilvh_d(t1, t0);
2110}
2111
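// Editor's note (illustrative, not part of the original header): the
// even/odd picks (and ilvl/ilvh for 64-bit lanes) above split two-channel
// interleaved data:
//   uchar buf[32];                   // a0 b0 a1 b1 ... a15 b15, filled elsewhere
//   v_uint8x16 A, B;
//   v_load_deinterleave(buf, A, B);  // A = {a0..a15}, B = {b0..b15}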
2112inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
2113{
2114 __m128i t0 = __lsx_vld(ptr, 0);
2115 __m128i t1 = __lsx_vld(ptr, 16);
2116 __m128i t2 = __lsx_vld(ptr, 32);
2117 const __m128i shuff0 = _v128_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2118 const __m128i shuff1 = _v128_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2119 __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff0);
2120 __m128i b0 = __lsx_vbitsel_v(t1, t0, shuff1);
2121 __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
2122 const __m128i shuff_a = _v128_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29);
2123 const __m128i shuff_b = _v128_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30);
2124 const __m128i shuff_c = _v128_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31);
2125
2126 a.val = __lsx_vshuf_b(t2, a0, shuff_a);
2127 b.val = __lsx_vshuf_b(t2, b0, shuff_b);
2128 c.val = __lsx_vshuf_b(t2, c0, shuff_c);
2129}
2130
2131inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2132{
2133 __m128i t0 = __lsx_vld(ptr, 0);
2134 __m128i t1 = __lsx_vld(ptr, 16);
2135 __m128i t2 = __lsx_vld(ptr, 32);
2136 const __m128i shuff0 = _v128_setr_h(0, 0, -1, 0, 0, -1, 0, 0);
2137 const __m128i shuff1 = _v128_setr_h(0, -1, 0, 0, -1, 0, 0, -1);
2138
2139 __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff1);
2140 __m128i b0 = __lsx_vbitsel_v(t0, t1, shuff0);
2141 __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
2142
2143 const __m128i shuff_a = _v128_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 20, 21, 26, 27);
2144 const __m128i shuff_b = _v128_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 16, 17, 22, 23, 28, 29);
2145 const __m128i shuff_c = _v128_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31);
2146
2147 a.val = __lsx_vshuf_b(t2, a0, shuff_a);
2148 b.val = __lsx_vshuf_b(t2, b0, shuff_b);
2149 c.val = __lsx_vshuf_b(t2, c0, shuff_c);
2150}
2151
2152inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2153{
2154 __m128i t0 = __lsx_vld(ptr, 0);
2155 __m128i t1 = __lsx_vld(ptr, 16);
2156 __m128i t2 = __lsx_vld(ptr, 32);
2157
2158 __m128i a0 = __lsx_vpermi_w(t1, t0, 0xAC);
2159 __m128i b0 = __lsx_vpermi_w(t1, t0, 0xC5);
2160 __m128i c0 = __lsx_vpermi_w(t1, t0, 0x5A);
2161
2162 a.val = __lsx_vextrins_w(a0, t2, 0x31);
2163 b0 = __lsx_vshuf4i_w(b0, 0x38);
2164 c0 = __lsx_vshuf4i_w(c0, 0x8);
2165 b.val = __lsx_vextrins_w(b0, t2, 0x32);
2166 c.val = __lsx_vpermi_w(t2, c0, 0xC4);
2167}
2168
2169inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2170{
2171 __m128i t0 = __lsx_vld(ptr, 0);
2172 __m128i t1 = __lsx_vld(ptr, 16);
2173 __m128i t2 = __lsx_vld(ptr, 32);
2174
2175 a.val = __lsx_vshuf4i_d(t0, t1, 0xC);
2176 b.val = __lsx_vshuf4i_d(t0, t2, 0x9);
2177 c.val = __lsx_vshuf4i_d(t1, t2, 0xC);
2178}
2179
2180inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2181{
2182 __m128i t0 = __lsx_vld(ptr, 0);
2183 __m128i t1 = __lsx_vld(ptr, 16);
2184 __m128i t2 = __lsx_vld(ptr, 32);
2185 __m128i t3 = __lsx_vld(ptr, 48);
2186
2187 __m128i ac_lo = __lsx_vpickev_b(t1, t0);
2188 __m128i bd_lo = __lsx_vpickod_b(t1, t0);
2189 __m128i ac_hi = __lsx_vpickev_b(t3, t2);
2190 __m128i bd_hi = __lsx_vpickod_b(t3, t2);
2191
2192 a.val = __lsx_vpickev_b(ac_hi, ac_lo);
2193 c.val = __lsx_vpickod_b(ac_hi, ac_lo);
2194 b.val = __lsx_vpickev_b(bd_hi, bd_lo);
2195 d.val = __lsx_vpickod_b(bd_hi, bd_lo);
2196}
2197
2198inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2199{
2200 __m128i t0 = __lsx_vld(ptr, 0);
2201 __m128i t1 = __lsx_vld(ptr, 16);
2202 __m128i t2 = __lsx_vld(ptr, 32);
2203 __m128i t3 = __lsx_vld(ptr, 48);
2204
2205 __m128i ac_lo = __lsx_vpickev_h(t1, t0);
2206 __m128i bd_lo = __lsx_vpickod_h(t1, t0);
2207 __m128i ac_hi = __lsx_vpickev_h(t3, t2);
2208 __m128i bd_hi = __lsx_vpickod_h(t3, t2);
2209
2210 a.val = __lsx_vpickev_h(ac_hi, ac_lo);
2211 c.val = __lsx_vpickod_h(ac_hi, ac_lo);
2212 b.val = __lsx_vpickev_h(bd_hi, bd_lo);
2213 d.val = __lsx_vpickod_h(bd_hi, bd_lo);
2214}
2215
2216inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2217{
2218 __m128i p0 = __lsx_vld(ptr, 0);
2219 __m128i p1 = __lsx_vld(ptr, 16);
2220 __m128i p2 = __lsx_vld(ptr, 32);
2221 __m128i p3 = __lsx_vld(ptr, 48);
2222
2223 __m128i t0 = __lsx_vilvl_w(p1, p0);
2224 __m128i t1 = __lsx_vilvl_w(p3, p2);
2225 __m128i t2 = __lsx_vilvh_w(p1, p0);
2226 __m128i t3 = __lsx_vilvh_w(p3, p2);
2227 a.val = __lsx_vilvl_d(t1, t0);
2228 b.val = __lsx_vilvh_d(t1, t0);
2229 c.val = __lsx_vilvl_d(t3, t2);
2230 d.val = __lsx_vilvh_d(t3, t2);
2231}
2232
2233inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2234{
2235 __m128i t0 = __lsx_vld(ptr, 0);
2236 __m128i t1 = __lsx_vld(ptr, 16);
2237 __m128i t2 = __lsx_vld(ptr, 32);
2238 __m128i t3 = __lsx_vld(ptr, 48);
2239
2240 a.val = __lsx_vilvl_d(t2, t0);
2241 b.val = __lsx_vilvh_d(t2, t0);
2242 c.val = __lsx_vilvl_d(t3, t1);
2243 d.val = __lsx_vilvh_d(t3, t1);
2244}
2245
2246///////////////////////////// store interleave /////////////////////////////
2247
2248inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2249                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2250{
2251 __m128i v0 = __lsx_vilvl_b(b.val, a.val);
2252 __m128i v1 = __lsx_vilvh_b(b.val, a.val);
2253
2254 __lsx_vst(v0, ptr, 0);
2255 __lsx_vst(v1, ptr, 16);
2256}
2257
2258inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2259                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2260{
2261 __m128i v0 = __lsx_vilvl_h(b.val, a.val);
2262 __m128i v1 = __lsx_vilvh_h(b.val, a.val);
2263
2264 __lsx_vst(v0, ptr, 0);
2265 __lsx_vst(v1, ptr, 16);
2266}
2267
2268inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2269                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2270{
2271 __m128i v0 = __lsx_vilvl_w(b.val, a.val);
2272 __m128i v1 = __lsx_vilvh_w(b.val, a.val);
2273
2274 __lsx_vst(v0, ptr, 0);
2275 __lsx_vst(v1, ptr, 16);
2276}
2277
2278inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
2279                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2280{
2281 __m128i v0 = __lsx_vilvl_d(b.val, a.val);
2282 __m128i v1 = __lsx_vilvh_d(b.val, a.val);
2283
2284 __lsx_vst(v0, ptr, 0);
2285 __lsx_vst(v1, ptr, 16);
2286}
2287
2288inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c,
2289                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2290{
2291 __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
2292 __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
2293 __m128i v_c = c.val;
2294 const __m128i shuff0 = _v128_setr_b(0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10);
2295 const __m128i shuff1 = _v128_setr_b(11, 21, 12, 13, 22, 14, 15, 23, 0, 0, 0, 0, 0, 0, 0, 0);
2296 const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 24, 18, 19, 25, 20, 21);
2297 const __m128i shuff3 = _v128_setr_b(26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31);
2298 __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
2299
2300 __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
2301 __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
2302 __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
2303 dst1 = __lsx_vshuf_b(abc, dst1, shuff2);
2304
2305 __lsx_vst(dst0, ptr, 0);
2306 __lsx_vst(dst1, ptr, 16);
2307 __lsx_vst(dst2, ptr, 32);
2308}
2309
2310inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, const v_uint16x8& c,
2311                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2312{
2313 __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
2314 __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
2315 __m128i v_c = c.val;
2316 const __m128i shuff0 = _v128_setr_b(0, 1, 2, 3, 16, 17, 4, 5, 6, 7, 18, 19, 8, 9, 10, 11);
2317 const __m128i shuff1 = _v128_setr_b(20, 21, 12, 13, 14, 15, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0);
2318 const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 24, 25, 20, 21);
2319 const __m128i shuff3 = _v128_setr_b(6, 7, 26, 27, 8, 9, 10, 11, 28, 29, 12, 13, 14, 15, 30, 31);
2320 __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
2321
2322 __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
2323 __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
2324 __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
2325 dst1 = __lsx_vshuf_b(abc, dst1, shuff2);
2326
2327 __lsx_vst(dst0, ptr, 0);
2328 __lsx_vst(dst1, ptr, 16);
2329 __lsx_vst(dst2, ptr, 32);
2330}
2331
2332inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, const v_uint32x4& c,
2333                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2334{
2335 __m128i v_c = c.val;
2336 __m128i ab_lo = __lsx_vilvl_w(b.val, a.val); //a0 b0 a1 b1
2337 __m128i ab_hi = __lsx_vilvh_w(b.val, a.val); //a2 b2 a3 b3
2338 __m128i bc_od = __lsx_vpackod_w(v_c, b.val); // b1 c1 b3 c3
2339
2340 __m128i dst0 = __lsx_vshuf4i_w(ab_lo, 0xB4); //a0 b0 b1 a1
2341 __m128i dst1 = __lsx_vilvl_d(ab_hi, bc_od); //b1 c1 a2 b2
2342 __m128i dst2 = __lsx_vpermi_w(bc_od, ab_hi, 0xE8); //a2, a3, b3, c3
2343
2344 dst0 = __lsx_vextrins_w(dst0, v_c, 0x20);
2345 dst2 = __lsx_vextrins_w(dst2, v_c, 0x2);
2346 __lsx_vst(dst0, ptr, 0); //a0 b0 c0 a1
2347 __lsx_vst(dst1, ptr, 16); //b1 c1 a2 b2
2348 __lsx_vst(dst2, ptr, 32); //c2 a3 b3 c3
2349}
2350
2351inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
2352                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2353{
2354 __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
2355 __m128i dst1 = __lsx_vpermi_w(a.val, c.val, 0xE4);
2356 __m128i dst2 = __lsx_vilvh_d(c.val, b.val);
2357
2358 __lsx_vst(dst0, ptr, 0);
2359 __lsx_vst(dst1, ptr, 16);
2360 __lsx_vst(dst2, ptr, 32);
2361}
2362
2363inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2364 const v_uint8x16& c, const v_uint8x16& d,
2365                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2366{
2367 __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
2368 __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
2369 __m128i cd_lo = __lsx_vilvl_b(d.val, c.val);
2370 __m128i cd_hi = __lsx_vilvh_b(d.val, c.val);
2371
2372 __m128i dst0 = __lsx_vilvl_h(cd_lo, ab_lo);
2373 __m128i dst1 = __lsx_vilvh_h(cd_lo, ab_lo);
2374 __m128i dst2 = __lsx_vilvl_h(cd_hi, ab_hi);
2375 __m128i dst3 = __lsx_vilvh_h(cd_hi, ab_hi);
2376
2377 __lsx_vst(dst0, ptr, 0);
2378 __lsx_vst(dst1, ptr, 16);
2379 __lsx_vst(dst2, ptr, 32);
2380 __lsx_vst(dst3, ptr, 48);
2381}
2382
2383inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2384 const v_uint16x8& c, const v_uint16x8& d,
2385                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2386{
2387 __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
2388 __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
2389 __m128i cd_lo = __lsx_vilvl_h(d.val, c.val);
2390 __m128i cd_hi = __lsx_vilvh_h(d.val, c.val);
2391
2392 __m128i dst0 = __lsx_vilvl_w(cd_lo, ab_lo);
2393 __m128i dst1 = __lsx_vilvh_w(cd_lo, ab_lo);
2394 __m128i dst2 = __lsx_vilvl_w(cd_hi, ab_hi);
2395 __m128i dst3 = __lsx_vilvh_w(cd_hi, ab_hi);
2396
2397 __lsx_vst(dst0, ptr, 0);
2398 __lsx_vst(dst1, ptr, 16);
2399 __lsx_vst(dst2, ptr, 32);
2400 __lsx_vst(dst3, ptr, 48);
2401}
2402
2403inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2404 const v_uint32x4& c, const v_uint32x4& d,
2405                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2406{
2407 __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
2408 __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
2409 __m128i cd_lo = __lsx_vilvl_w(d.val, c.val);
2410 __m128i cd_hi = __lsx_vilvh_w(d.val, c.val);
2411
2412 __m128i dst0 = __lsx_vilvl_d(cd_lo, ab_lo);
2413 __m128i dst1 = __lsx_vilvh_d(cd_lo, ab_lo);
2414 __m128i dst2 = __lsx_vilvl_d(cd_hi, ab_hi);
2415 __m128i dst3 = __lsx_vilvh_d(cd_hi, ab_hi);
2416
2417 __lsx_vst(dst0, ptr, 0);
2418 __lsx_vst(dst1, ptr, 16);
2419 __lsx_vst(dst2, ptr, 32);
2420 __lsx_vst(dst3, ptr, 48);
2421}
2422
2423inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
2424 const v_uint64x2& c, const v_uint64x2& d,
2425                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2426{
2427 __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
2428 __m128i dst2 = __lsx_vilvh_d(b.val, a.val);
2429 __m128i dst1 = __lsx_vilvl_d(d.val, c.val);
2430 __m128i dst3 = __lsx_vilvh_d(d.val, c.val);
2431
2432 __lsx_vst(dst0, ptr, 0);
2433 __lsx_vst(dst1, ptr, 16);
2434 __lsx_vst(dst2, ptr, 32);
2435 __lsx_vst(dst3, ptr, 48);
2436}
2437
2438#define OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2439inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0) \
2440{ \
2441 _Tpvec1 a1, b1; \
2442 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2443 a0 = v_reinterpret_as_##suffix0(a1); \
2444 b0 = v_reinterpret_as_##suffix0(b1); \
2445} \
2446inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0) \
2447{ \
2448 _Tpvec1 a1, b1, c1; \
2449 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2450 a0 = v_reinterpret_as_##suffix0(a1); \
2451 b0 = v_reinterpret_as_##suffix0(b1); \
2452 c0 = v_reinterpret_as_##suffix0(c1); \
2453} \
2454inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, \
2455 _Tpvec0& c0, _Tpvec0& d0) \
2456{ \
2457 _Tpvec1 a1, b1, c1, d1; \
2458 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2459 a0 = v_reinterpret_as_##suffix0(a1); \
2460 b0 = v_reinterpret_as_##suffix0(b1); \
2461 c0 = v_reinterpret_as_##suffix0(c1); \
2462 d0 = v_reinterpret_as_##suffix0(d1); \
2463} \
2464inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2465 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2466{ \
2467 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2468 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2469 v_store_interleave((_Tp1*)ptr, a1, b1); \
2470} \
2471inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0,\
2472 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2473{ \
2474 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2475 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2476 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2477 v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
2478} \
2479inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2480 const _Tpvec0& c0, const _Tpvec0& d0, \
2481 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2482{ \
2483 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2484 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2485 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2486 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2487 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
2488}
2489
2490OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2491OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2492OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2493OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
2494OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2495OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
2496
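// Editor's note (illustrative, not part of the original header): the macro
// above routes signed and float element types through their unsigned
// counterparts via reinterpret casts, so e.g.
//   short buf[16];                   // 8 interleaved (a, b) pairs, filled elsewhere
//   v_int16x8 A, B;
//   v_load_deinterleave(buf, A, B);  // dispatches to the u16 implementation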
2497//
2498// FP16
2499//
2500
2501inline v_float32x4 v_load_expand(const hfloat* ptr)
2502{
2503#if CV_FP16
2504 return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0)));
2505#else
2506 float CV_DECL_ALIGNED(32) buf[4];
2507 for (int i = 0; i < 4; i++)
2508 buf[i] = (float)ptr[i];
2509 return v_float32x4((__m128)__lsx_vld(buf, 0));
2510#endif
2511}
2512
2513inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
2514{
2515#if CV_FP16
2516    __m128i res = (__m128i)__lsx_vfcvt_h_s(a.val, a.val);
2517 __lsx_vstelm_d(res, ptr, 0, 0);
2518#else
2519 float CV_DECL_ALIGNED(32) buf[4];
2520 v_store_aligned(buf, a);
2521 for (int i = 0; i < 4; i++)
2522 ptr[i] = hfloat(buf[i]);
2523#endif
2524}
2525
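// Editor's note (illustrative, not part of the original header): without
// native FP16 support (CV_FP16 == 0) both conversions fall back to scalar
// loops through an aligned float buffer:
//   hfloat half[8];                        // assumed to hold valid half floats
//   v_float32x4 f = v_load_expand(half);   // widen the first 4 halves to float
//   v_pack_store(half, f);                 // narrow 4 floats back to half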
2526//
2527// end of FP16
2528//
2529
2530inline void v_cleanup() {}
2531
2532CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2533
2534//! @endcond
2535
2536} // cv::
2537
2538#endif // OPENCV_HAL_INTRIN_LSX_HPP