EstervQrCode 2.0.0
Library for QR code manipulation
intrin_msa.hpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_HAL_INTRIN_MSA_HPP
#define OPENCV_HAL_INTRIN_MSA_HPP

#include <algorithm>
#include "opencv2/core/utility.hpp"

namespace cv
{

//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128 1

// MSA implements 128-bit wide vector registers shared with the 64-bit wide floating-point unit registers.
// MSA and the FPU cannot both be present unless the FPU has 64-bit floating-point registers.
#define CV_SIMD128_64F 1

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(v16u8 v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = msa_ld1q_u8(v);
    }

    uchar get0() const
    {
        return msa_getq_lane_u8(val, 0);
    }

    v16u8 val;
};

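// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): each wrapper type above simply owns one
// 128-bit MSA register, built lane-by-lane and read back via get0().
inline uchar example_first_lane()
{
    v_uint8x16 a(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    return a.get0(); // 0: lane 0 of the underlying v16u8 register
}
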
struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(v16i8 v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = msa_ld1q_s8(v);
    }

    schar get0() const
    {
        return msa_getq_lane_s8(val, 0);
    }

    v16i8 val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(v8u16 v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = msa_ld1q_u16(v);
    }

    ushort get0() const
    {
        return msa_getq_lane_u16(val, 0);
    }

    v8u16 val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(v8i16 v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = msa_ld1q_s16(v);
    }

    short get0() const
    {
        return msa_getq_lane_s16(val, 0);
    }

    v8i16 val;
};

struct v_uint32x4
{
    typedef unsigned int lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(v4u32 v) : val(v) {}
    v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3)
    {
        unsigned int v[] = {v0, v1, v2, v3};
        val = msa_ld1q_u32(v);
    }

    unsigned int get0() const
    {
        return msa_getq_lane_u32(val, 0);
    }

    v4u32 val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(v4i32 v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = msa_ld1q_s32(v);
    }

    int get0() const
    {
        return msa_getq_lane_s32(val, 0);
    }

    v4i32 val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(v4f32 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = msa_ld1q_f32(v);
    }

    float get0() const
    {
        return msa_getq_lane_f32(val, 0);
    }

    v4f32 val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(v2u64 v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = msa_ld1q_u64(v);
    }

    uint64 get0() const
    {
        return msa_getq_lane_u64(val, 0);
    }

    v2u64 val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(v2i64 v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = msa_ld1q_s64(v);
    }

    int64 get0() const
    {
        return msa_getq_lane_s64(val, 0);
    }

    v2i64 val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(v2f64 v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = msa_ld1q_f64(v);
    }

    double get0() const
    {
        return msa_getq_lane_f64(val, 0);
    }

    v2f64 val;
};

#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); }

OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32)
OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32)
OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64)

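// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): the INIT macro above generates the
// v_setzero_*/v_setall_* constructors plus bit-preserving reinterpretation
// between all ten register types.
inline v_int8x16 example_init_and_reinterpret()
{
    v_uint8x16 ones = v_setall_u8(1); // 16 lanes, each 0x01
    return v_reinterpret_as_s8(ones); // same 128 bits viewed as signed lanes
}
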
#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    return _Tpvec(mov(a.val, b.val)); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    return _Tpvec(rshr(a.val, b.val, n)); \
}

OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_uint16x8, pack, msa_qpack_u16, msa_qrpackr_u16)
OPENCV_HAL_IMPL_MSA_PACK(v_int8x16, v_int16x8, pack, msa_qpack_s16, msa_qrpackr_s16)
OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_uint32x4, pack, msa_qpack_u32, msa_qrpackr_u32)
OPENCV_HAL_IMPL_MSA_PACK(v_int16x8, v_int32x4, pack, msa_qpack_s32, msa_qrpackr_s32)
OPENCV_HAL_IMPL_MSA_PACK(v_uint32x4, v_uint64x2, pack, msa_pack_u64, msa_rpackr_u64)
OPENCV_HAL_IMPL_MSA_PACK(v_int32x4, v_int64x2, pack, msa_pack_s64, msa_rpackr_s64)
OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_int16x8, pack_u, msa_qpacku_s16, msa_qrpackru_s16)
OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_int32x4, pack_u, msa_qpacku_s32, msa_qrpackru_s32)

#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = mov(a.val); \
    msa_st1_##suffix(ptr, a1); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = rshr(a.val, n); \
    msa_st1_##suffix(ptr, a1); \
}

OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16)
OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32)

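// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): v_pack narrows two wide vectors into one
// via the saturating msa_qpack_* forms, so 300 clamps to 255 in u8 lanes.
inline v_uint8x16 example_saturating_pack()
{
    v_uint16x8 lo = v_setall_u16(300), hi = v_setall_u16(7);
    return v_pack(lo, hi); // lanes 0..7 == 255 (saturated), lanes 8..15 == 7
}
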
// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint8x16(msa_pack_u16(a.val, b.val));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val)));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val));
    v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val));
    return v_uint8x16(msa_pack_u16(abcd, efgh));
}

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    v4f32 v0 = v.val;
    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
    res = msa_mlaq_lane_f32(res, m3.val, v0, 3);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    v4f32 v0 = v.val;
    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
    res = msa_addq_f32(res, a.val);
    return v_float32x4(res);
}

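// Worked form (added for illustration, not part of the original header; the
// example_* name is hypothetical): v_matmul above computes
// res = m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], i.e. a 4x4 matrix stored as
// four column vectors m0..m3 multiplied by the vector v.
inline v_float32x4 example_identity_matmul(const v_float32x4& v)
{
    v_float32x4 m0(1.f, 0.f, 0.f, 0.f), m1(0.f, 1.f, 0.f, 0.f),
                m2(0.f, 0.f, 1.f, 0.f), m3(0.f, 0.f, 0.f, 1.f);
    return v_matmul(v, m0, m1, m2, m3); // identity columns: returns v unchanged
}
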
#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)

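// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): note the 8/16-bit integer operators above
// map to the saturating msa_qaddq_*/msa_qsubq_* forms, while 32/64-bit and
// float lanes use plain wrap-around/IEEE arithmetic.
inline v_uint8x16 example_saturating_add()
{
    v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
    return a + b; // every lane saturates to 255 instead of wrapping to 44
}
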
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int16x8, v_int32x4)
OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint16x8, v_uint32x4)

// Multiply and expand
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    v16i8 a_lo, a_hi, b_lo, b_hi;

    ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi);
    ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi);
    c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo));
    d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi));
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    v16u8 a_lo, a_hi, b_lo, b_hi;

    ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi);
    ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi);
    c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo));
    d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi));
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    v8i16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
    c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo));
    d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi));
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    v8u16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
    c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo));
    d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi));
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    v4u32 a_lo, a_hi, b_lo, b_hi;

    ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi);
    ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi);
    c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo));
    d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi));
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    v8i16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);

    return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)),
                                   msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16));
}

inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    v8u16 a_lo, a_hi, b_lo, b_hi;

    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);

    return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)),
                                    msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16));
}

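// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): v_mul_expand keeps full precision by
// widening, multiplying two u8 vectors into two u16 halves.
inline v_uint16x8 example_mul_expand_low()
{
    v_uint8x16 a = v_setall_u8(20), b = v_setall_u8(20);
    v_uint16x8 lo, hi;
    v_mul_expand(a, b, lo, hi); // each product 400 fits in 16 bits, no saturation
    return lo;
}
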
//////// Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(msa_dpadd_s_w(c.val, a.val, b.val)); }

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); }
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_int64x2(msa_dpadd_s_d(c.val, a.val, b.val)); }

// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
    v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
    v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
    v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
    v4u32 prod = msa_dotp_u_w(even_a, even_b);
    return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
    v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
    v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
    v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
    v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b);
    return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    v8i16 prod = msa_dotp_s_h(a.val, b.val);
    return v_int32x4(msa_hadd_s32(prod, prod));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                  const v_int32x4& c)
{ return v_dotprod_expand(a, b) + c; }

// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
    v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
    v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
    v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
    v2u64 prod = msa_dotp_u_d(even_a, even_b);
    return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
                                   const v_uint64x2& c)
{
    v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
    v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
    v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
    v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
    v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b);
    return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    v4i32 prod = msa_dotp_s_w(a.val, b.val);
    return v_int64x2(msa_hadd_s64(prod, prod));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }

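// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): v_dotprod multiplies adjacent lane pairs
// and sums each pair into the next wider lane type.
inline v_int32x4 example_dotprod()
{
    v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
    return v_dotprod(a, b); // each 32-bit lane holds 3*4 + 3*4 = 24
}
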

//////// Fast Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b, c); }

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b, c); }

// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b, c); }

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{ return v_dotprod_expand(a, b); }
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod_expand(a, b); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b, c); }

// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b, c); }

#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
}

OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)

#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}

/* v_abs */
#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) \
{ \
    return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \
}

OPENCV_HAL_IMPL_MSA_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_MSA_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_MSA_ABS(v_uint32x4, v_int32x4, u32, s32)

/* v_abs(float), v_sqrt, v_invsqrt */
#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a) \
{ \
    return _Tpvec(intrin(a.val)); \
}

OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_invsqrt, msa_rsqrtq_f32)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)

#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
}

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)

#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }

OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64)
OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64)

inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); }
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); }

OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16)

OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_absdiff, msa_abdq_u8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_absdiff, msa_abdq_u16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_absdiff, msa_abdq_u32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_absdiff, msa_abdq_f32)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_absdiff, msa_abdq_f64)

OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_absdiffs, msa_qabdq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_absdiffs, msa_qabdq_s16)

#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \
inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \
}

OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int8x16, v_uint8x16, v16u8, v_absdiff, msa_abdq_s8)
OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int16x8, v_uint16x8, v8u16, v_absdiff, msa_abdq_s16)
OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int32x4, v_uint32x4, v4u32, v_absdiff, msa_abdq_s32)

/* v_magnitude, v_sqr_magnitude, v_fma, v_muladd */
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}

// shifts: the runtime-count operator forms trade efficiency for convenience
#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); }

OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64)

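// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): the template forms take the shift amount as
// a compile-time constant, mapping to immediate-operand MSA instructions; the
// operator forms take a runtime int.
inline v_uint16x8 example_shifts()
{
    v_uint16x8 a = v_setall_u16(0x00F0);
    v_uint16x8 b = v_shl<4>(a); // 0x0F00, immediate shift
    return b >> 8;              // 0x000F, runtime shift
}
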
/* v_rotate_right, v_rotate_left */
#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ \
    return a; \
} \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ \
    CV_UNUSED(b); \
    return a; \
}

OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64)
OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64)

#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ msa_st1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    int n = _Tpvec::nlanes; \
    for( int i = 0; i < (n/2); i++ ) \
        ptr[i] = a.val[i]; \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    int n = _Tpvec::nlanes; \
    for( int i = 0; i < (n/2); i++ ) \
        ptr[i] = a.val[i+(n/2)]; \
}

OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64)

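// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): v_load/v_store move 128 bits between memory
// and a register; v_load_low fills only the low half and zeroes the rest.
inline void example_load_store(const float* src, float* dst)
{
    v_float32x4 a = v_load(src); // reads src[0..3]
    v_store(dst, a);             // writes dst[0..3]
}
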
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
    v_uint8x16 c = v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val));
    return c;
}

inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
    v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val));
    return c;
}

inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
    v_uint32x4 c;
    c.val[0] = a.val[3];
    c.val[1] = a.val[2];
    c.val[2] = a.val[1];
    c.val[3] = a.val[0];
    return c;
}

inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
    v_uint64x2 c;
    c.val[0] = a.val[1];
    c.val[1] = a.val[0];
    return c;
}

inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }

#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \
inline unsigned short v_reduce_##func(const v_uint16x8& a) \
{ \
    v8u16 a_lo, a_hi; \
    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \
    v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \
    v4u32 b_lo, b_hi; \
    ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \
    v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \
    return (unsigned short)cfunc(c[0], c[1]); \
}

OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, std::max)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, std::min)

#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \
inline short v_reduce_##func(const v_int16x8& a) \
{ \
    v8i16 a_lo, a_hi; \
    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \
    v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \
    v4i32 b_lo, b_hi; \
    ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \
    v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \
    return (short)cfunc(c[0], c[1]); \
}

OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, std::max)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, std::min)

#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \
}

OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, min, std::min)

#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpvec2 a1, a2; \
    v_expand(a, a1, a2); \
    return (scalartype)v_reduce_##func(v_##func(a1, a2)); \
}

OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, min)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, max)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, min)
OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, max)

#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    return (scalartype)msa_sum_##suffix(a.val); \
}

OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned short, u8)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, short, s8)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned, u16)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, int, s16)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, uint64_t, u32)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int64_t, s32)
OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32)

inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); }
inline int64 v_reduce_sum(const v_int64x2& a)
{ return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); }
inline double v_reduce_sum(const v_float64x2& a)
{
    return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1);
}

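// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): reductions collapse a vector to one scalar;
// the sum of u8 lanes is returned widened to unsigned short.
inline unsigned short example_reduce_sum()
{
    v_uint8x16 a = v_setall_u8(10);
    return v_reduce_sum(a); // 16 * 10 = 160
}
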
/* v_reduce_sum4, v_reduce_sad */
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))),
                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val)))); // a0+a1 b0+b1 a2+a3 b2+b3
    v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))),
                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val)))); // c0+c1 d0+d1 c2+c3 d2+d3

    return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))),
                                    MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0)))));
}

inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    v16u8 t0 = msa_abdq_u8(a.val, b.val);
    v8u16 t1 = msa_paddlq_u8(t0);
    v4u32 t2 = msa_paddlq_u16(t1);
    return msa_sum_u32(t2);
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val));
    v8u16 t1 = msa_paddlq_u8(t0);
    v4u32 t2 = msa_paddlq_u16(t1);
    return msa_sum_u32(t2);
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    v8u16 t0 = msa_abdq_u16(a.val, b.val);
    v4u32 t1 = msa_paddlq_u16(t0);
    return msa_sum_u32(t1);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val));
    v4u32 t1 = msa_paddlq_u16(t0);
    return msa_sum_u32(t1);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    v4u32 t0 = msa_abdq_u32(a.val, b.val);
    return msa_sum_u32(t0);
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val));
    return msa_sum_u32(t0);
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    v4f32 t0 = msa_abdq_f32(a.val, b.val);
    return msa_sum_f32(t0);
}

/* v_popcount */
#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \
inline v_uint8x16 v_popcount(const _Tpvec& a) \
{ \
    v16u8 t = MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \
    return v_uint8x16(t); \
}
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16)

#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \
inline v_uint16x8 v_popcount(const _Tpvec& a) \
{ \
    v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \
    return v_uint16x8(t); \
}
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8)

#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
    v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \
    return v_uint32x4(t); \
}
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4)

#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \
inline v_uint64x2 v_popcount(const _Tpvec& a) \
{ \
    v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \
    return v_uint64x2(t); \
}
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2)
OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2)

inline int v_signmask(const v_uint8x16& a)
{
    v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100));
    v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0));
    v8u16 v1 = msa_paddlq_u8(v0);
    v4u32 v2 = msa_paddlq_u16(v1);
    v2u64 v3 = msa_paddlq_u32(v2);
    return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_uint16x8& a)
{
    v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000));
    v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0));
    v4u32 v1 = msa_paddlq_u16(v0);
    v2u64 v2 = msa_paddlq_u32(v1);
    return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4);
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000));
    v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0));
    v2u64 v1 = msa_paddlq_u32(v0);
    return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }

inline int v_signmask(const v_uint64x2& a)
{
    v2u64 v0 = msa_shrq_n_u64(a.val, 63);
    return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1);
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }

inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }

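// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): comparisons yield all-ones/all-zeros lanes,
// v_signmask packs one bit per lane into an int, and v_scan_forward returns
// the index of the first set lane.
inline int example_first_match(const v_int32x4& a, const v_int32x4& b)
{
    v_int32x4 m = (a == b);               // per-lane mask
    int bits = v_signmask(m);             // one bit per lane
    return bits ? v_scan_forward(m) : -1; // index of first equal lane, or -1
}
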
#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \
    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \
    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \
}

OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7)
OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15)
OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31)
OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63)

inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }

inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }

inline bool v_check_all(const v_int64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_int64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }

/* v_select */
#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \
                  MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \
}

OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8)
OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8)

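// Usage sketch (added for illustration, not part of the original header; the
// example_* name is hypothetical): v_select picks lanes from a where the mask
// lane is all-ones, otherwise from b - a branchless per-lane conditional built
// on msa_bslq_u8.
inline v_float32x4 example_clamp_negative_to_zero(const v_float32x4& x)
{
    v_float32x4 zero = v_setall_f32(0.f);
    return v_select(x < zero, zero, x); // lanes below zero become 0
}
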
1268#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \
1269inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1270{ \
1271 _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1272 _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1273 b0.val = msa_paddlq_##suffix(a_lo); \
1274 b1.val = msa_paddlq_##suffix(a_hi); \
1275} \
1276inline _Tpwvec v_expand_low(const _Tpvec& a) \
1277{ \
1278 _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1279 return _Tpwvec(msa_paddlq_##suffix(a_lo)); \
1280} \
1281inline _Tpwvec v_expand_high(const _Tpvec& a) \
1282{ \
1283 _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1284 return _Tpwvec(msa_paddlq_##suffix(a_hi)); \
1285} \
1286inline _Tpwvec v_load_expand(const _Tp* ptr) \
1287{ \
1288 return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \
1289}
1290
1291OPENCV_HAL_IMPL_MSA_EXPAND(v_uint8x16, v_uint16x8, uchar, u8, s8, v16u8, v16i8)
1292OPENCV_HAL_IMPL_MSA_EXPAND(v_int8x16, v_int16x8, schar, s8, s8, v16i8, v16i8)
1293OPENCV_HAL_IMPL_MSA_EXPAND(v_uint16x8, v_uint32x4, ushort, u16, s16, v8u16, v8i16)
1294OPENCV_HAL_IMPL_MSA_EXPAND(v_int16x8, v_int32x4, short, s16, s16, v8i16, v8i16)
1295OPENCV_HAL_IMPL_MSA_EXPAND(v_uint32x4, v_uint64x2, uint, u32, s32, v4u32, v4i32)
1296OPENCV_HAL_IMPL_MSA_EXPAND(v_int32x4, v_int64x2, int, s32, s32, v4i32, v4i32)
1297
1298inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1299{
1300 return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]});
1301}
1302
1303inline v_int32x4 v_load_expand_q(const schar* ptr)
1304{
1305 return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]});
1306}
1307
1308/* v_zip, v_combine_low, v_combine_high, v_recombine */
1309#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \
1310inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1311{ \
1312 b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1313 b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1314} \
1315inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1316{ \
1317 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
1318} \
1319inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1320{ \
1321 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
1322} \
1323inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1324{ \
1325 c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
1326 d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
1327}
1328
1329OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8)
1330OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8)
1331OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16)
1332OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16)
1333OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32)
1334OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32)
1335OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32)
1336OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64)
1337
1338/* v_extract */
1339#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \
1340template <int s> \
1341inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
1342{ \
1343 return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \
1344}
1345
1346OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8)
1347OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8)
1348OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16)
1349OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16)
1350OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, v4i32, s32)
1351OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32)
1352OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64)
1353OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64)
1354OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32)
1355OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64)
1356
1357/* v_round, v_floor, v_ceil, v_trunc */
1358inline v_int32x4 v_round(const v_float32x4& a)
1359{
1360 return v_int32x4(msa_cvttintq_s32_f32(a.val));
1361}
1362
1363inline v_int32x4 v_floor(const v_float32x4& a)
1364{
1365 v4i32 a1 = msa_cvttintq_s32_f32(a.val);
1366 return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val))));
1367}
1368
1369inline v_int32x4 v_ceil(const v_float32x4& a)
1370{
1371 v4i32 a1 = msa_cvttintq_s32_f32(a.val);
1372 return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1)))));
1373}
1374
1375inline v_int32x4 v_trunc(const v_float32x4& a)
1376{
1377 return v_int32x4(msa_cvttruncq_s32_f32(a.val));
1378}
1379
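// The double-precision variants below produce two results in lanes 0-1 of
// the returned v_int32x4; unless a second operand fills them, lanes 2-3
// are zeroed with msa_dupq_n_s64(0).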
1380inline v_int32x4 v_round(const v_float64x2& a)
1381{
1382 return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0)));
1383}
1384
1385inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1386{
1387 return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val)));
1388}
1389
1390inline v_int32x4 v_floor(const v_float64x2& a)
1391{
1392 v2f64 a1 = msa_cvtrintq_f64(a.val);
1393 return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0)));
1394}
1395
1396inline v_int32x4 v_ceil(const v_float64x2& a)
1397{
1398 v2f64 a1 = msa_cvtrintq_f64(a.val);
1399 return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0)));
1400}
1401
1402inline v_int32x4 v_trunc(const v_float64x2& a)
1403{
1404 return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0)));
1405}
1406
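// Note: the 4x4 transpose runs in two interleave stages: 32-bit interleaves
// of the row pairs (a0,a1) and (a2,a3) gather matching lanes, then 64-bit
// interleaves of those temporaries assemble the transposed columns.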
1407#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \
1408inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1409 const _Tpvec& a2, const _Tpvec& a3, \
1410 _Tpvec& b0, _Tpvec& b1, \
1411 _Tpvec& b2, _Tpvec& b3) \
1412{ \
1413 _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1414 _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1415 _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
1416 _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
1417 b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
1418 b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
1419 b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
1420 b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
1421}
1422
1423OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32)
1424OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32)
1425OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32)
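// Illustrative sketch (not part of the upstream header): transposing a
// row-major 4x4 float matrix m in place with the intrinsics above.
//
//   v_float32x4 r0 = v_load(m + 0), r1 = v_load(m + 4),
//               r2 = v_load(m + 8), r3 = v_load(m + 12), c0, c1, c2, c3;
//   v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);
//   v_store(m + 0, c0); v_store(m + 4, c1);
//   v_store(m + 8, c2); v_store(m + 12, c3);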
1426
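// Note: v_load_deinterleave/v_store_interleave map directly onto the MSA
// multi-vector helpers (msa_ld2q_*/msa_st2q_* and friends), converting
// interleaved channel data such as RGBRGB... to and from per-channel
// vectors. The hal::StoreMode parameter is accepted for API compatibility
// but ignored by this backend.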
1427#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \
1428inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
1429{ \
1430 msa_ld2q_##suffix(ptr, &a.val, &b.val); \
1431} \
1432inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
1433{ \
1434 msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \
1435} \
1436inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
1437 v_##_Tpvec& c, v_##_Tpvec& d) \
1438{ \
1439 msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \
1440} \
1441inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1442 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1443{ \
1444 msa_st2q_##suffix(ptr, a.val, b.val); \
1445} \
1446inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1447 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1448{ \
1449 msa_st3q_##suffix(ptr, a.val, b.val, c.val); \
1450} \
1451inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1452 const v_##_Tpvec& c, const v_##_Tpvec& d, \
1453 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
1454{ \
1455 msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \
1456}
1457
1458OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8)
1459OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8)
1460OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16)
1461OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16)
1462OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32)
1463OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32)
1464OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32)
1465OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64)
1466OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64)
1467OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64)
1468
1469/* v_cvt_f32, v_cvt_f64, v_cvt_f64_high */
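// Note: narrowing two doubles to floats fills lanes 0-1 from the first
// operand; the single-operand v_cvt_f32(v_float64x2) zeroes lanes 2-3.
// v_cvt_f64 converts the low half of its source, v_cvt_f64_high the high
// half.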
1470inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1471{
1472 return v_float32x4(msa_cvtfintq_f32_s32(a.val));
1473}
1474
1475inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1476{
1477 return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0)));
1478}
1479
1480inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1481{
1482 return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val));
1483}
1484
1485inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1486{
1487 return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
1488}
1489
1490inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1491{
1492 return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
1493}
1494
1495inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1496{
1497 return v_float64x2(msa_cvtflq_f64_f32(a.val));
1498}
1499
1500inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1501{
1502 return v_float64x2(msa_cvtfhq_f64_f32(a.val));
1503}
1504
1505inline v_float64x2 v_cvt_f64(const v_int64x2& a)
1506{
1507 return v_float64x2(msa_cvtfintq_f64_s64(a.val));
1508}
1509
1510////////////// Lookup table access ////////////////
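// Note: MSA offers no vector gather, so the v_lut family collects scalars
// into an aligned temporary and loads it as a single vector; the pair and
// quad variants use wider contiguous loads where the layout allows.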
1511inline v_int8x16 v_lut(const schar* tab, const int* idx)
1512{
1513 schar CV_DECL_ALIGNED(32) elems[16] =
1514 {
1515 tab[idx[ 0]],
1516 tab[idx[ 1]],
1517 tab[idx[ 2]],
1518 tab[idx[ 3]],
1519 tab[idx[ 4]],
1520 tab[idx[ 5]],
1521 tab[idx[ 6]],
1522 tab[idx[ 7]],
1523 tab[idx[ 8]],
1524 tab[idx[ 9]],
1525 tab[idx[10]],
1526 tab[idx[11]],
1527 tab[idx[12]],
1528 tab[idx[13]],
1529 tab[idx[14]],
1530 tab[idx[15]]
1531 };
1532 return v_int8x16(msa_ld1q_s8(elems));
1533}
1534inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1535{
1536 schar CV_DECL_ALIGNED(32) elems[16] =
1537 {
1538 tab[idx[0]],
1539 tab[idx[0] + 1],
1540 tab[idx[1]],
1541 tab[idx[1] + 1],
1542 tab[idx[2]],
1543 tab[idx[2] + 1],
1544 tab[idx[3]],
1545 tab[idx[3] + 1],
1546 tab[idx[4]],
1547 tab[idx[4] + 1],
1548 tab[idx[5]],
1549 tab[idx[5] + 1],
1550 tab[idx[6]],
1551 tab[idx[6] + 1],
1552 tab[idx[7]],
1553 tab[idx[7] + 1]
1554 };
1555 return v_int8x16(msa_ld1q_s8(elems));
1556}
1557inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1558{
1559 schar CV_DECL_ALIGNED(32) elems[16] =
1560 {
1561 tab[idx[0]],
1562 tab[idx[0] + 1],
1563 tab[idx[0] + 2],
1564 tab[idx[0] + 3],
1565 tab[idx[1]],
1566 tab[idx[1] + 1],
1567 tab[idx[1] + 2],
1568 tab[idx[1] + 3],
1569 tab[idx[2]],
1570 tab[idx[2] + 1],
1571 tab[idx[2] + 2],
1572 tab[idx[2] + 3],
1573 tab[idx[3]],
1574 tab[idx[3] + 1],
1575 tab[idx[3] + 2],
1576 tab[idx[3] + 3]
1577 };
1578 return v_int8x16(msa_ld1q_s8(elems));
1579}
1580inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1581inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1582inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
1583
1584
1585inline v_int16x8 v_lut(const short* tab, const int* idx)
1586{
1587 short CV_DECL_ALIGNED(32) elems[8] =
1588 {
1589 tab[idx[0]],
1590 tab[idx[1]],
1591 tab[idx[2]],
1592 tab[idx[3]],
1593 tab[idx[4]],
1594 tab[idx[5]],
1595 tab[idx[6]],
1596 tab[idx[7]]
1597 };
1598 return v_int16x8(msa_ld1q_s16(elems));
1599}
1600inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1601{
1602 short CV_DECL_ALIGNED(32) elems[8] =
1603 {
1604 tab[idx[0]],
1605 tab[idx[0] + 1],
1606 tab[idx[1]],
1607 tab[idx[1] + 1],
1608 tab[idx[2]],
1609 tab[idx[2] + 1],
1610 tab[idx[3]],
1611 tab[idx[3] + 1]
1612 };
1613 return v_int16x8(msa_ld1q_s16(elems));
1614}
1615inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1616{
1617 return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1])));
1618}
1619inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
1620inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
1621inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
1622
1623inline v_int32x4 v_lut(const int* tab, const int* idx)
1624{
1625 int CV_DECL_ALIGNED(32) elems[4] =
1626 {
1627 tab[idx[0]],
1628 tab[idx[1]],
1629 tab[idx[2]],
1630 tab[idx[3]]
1631 };
1632 return v_int32x4(msa_ld1q_s32(elems));
1633}
1634inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1635{
1636 return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1])));
1637}
1638inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1639{
1640 return v_int32x4(msa_ld1q_s32(tab + idx[0]));
1641}
1642inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
1643inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1644inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1645
1646inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1647{
1648 return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]])));
1649}
1650inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1651{
1652 return v_int64x2(msa_ld1q_s64(tab + idx[0]));
1653}
1654inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1655inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1656
1657inline v_float32x4 v_lut(const float* tab, const int* idx)
1658{
1659 float CV_DECL_ALIGNED(32) elems[4] =
1660 {
1661 tab[idx[0]],
1662 tab[idx[1]],
1663 tab[idx[2]],
1664 tab[idx[3]]
1665 };
1666 return v_float32x4(msa_ld1q_f32(elems));
1667}
1668inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1669{
1670 uint64 CV_DECL_ALIGNED(32) elems[2] =
1671 {
1672 *(uint64*)(tab + idx[0]),
1673 *(uint64*)(tab + idx[1])
1674 };
1675 return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems)));
1676}
1677inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1678{
1679 return v_float32x4(msa_ld1q_f32(tab + idx[0]));
1680}
1681
1682inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1683{
1684 int CV_DECL_ALIGNED(32) idx[4];
1685 v_store_aligned(idx, idxvec);
1686
1687 return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1688}
1689
1690inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1691{
1692 unsigned CV_DECL_ALIGNED(32) elems[4] =
1693 {
1694 tab[msa_getq_lane_s32(idxvec.val, 0)],
1695 tab[msa_getq_lane_s32(idxvec.val, 1)],
1696 tab[msa_getq_lane_s32(idxvec.val, 2)],
1697 tab[msa_getq_lane_s32(idxvec.val, 3)]
1698 };
1699 return v_uint32x4(msa_ld1q_u32(elems));
1700}
1701
1702inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1703{
1704 int CV_DECL_ALIGNED(32) idx[4];
1705 v_store_aligned(idx, idxvec);
1706
1707 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1708}
1709
1710inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1711{
1712 int CV_DECL_ALIGNED(32) idx[4];
1713 v_store_aligned(idx, idxvec);
1714
1715 v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2]));
1716 v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3]));
1717 x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
1718 y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
1719}
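// Note: after loading the (x, y) pairs, msa_ilvevq_* keeps the even lanes
// (the x values) and msa_ilvodq_* the odd lanes (the y values).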
1720
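// Note: the v2i64 constants below are VSHF control vectors. Each byte,
// read least-significant first, names the source lane for the
// corresponding output lane; e.g. 0x...03010200 emits lanes 0,2,1,3,...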
1721inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
1722{
1723 v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val));
1724 return c;
1725}
1726inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
1727{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1728inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
1729{
1730 v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val));
1731 return c;
1732}
1733inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1734
1735inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
1736{
1737 v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val));
1738 return c;
1739}
1740
1741inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1742
1743inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
1744{
1745 v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val));
1746 return c;
1747}
1748
1749inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1750
1751inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
1752{
1753 v_int32x4 c;
1754 c.val[0] = vec.val[0];
1755 c.val[1] = vec.val[2];
1756 c.val[2] = vec.val[1];
1757 c.val[3] = vec.val[3];
1758 return c;
1759}
1760
1761inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1762inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1763
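// Note: v_pack_triplets compacts triplets stored as (x, y, z, pad) groups
// by dropping every fourth lane; a 32-bit or float vector already holds a
// single triplet in lanes 0-2, so it is returned unchanged.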
1764inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
1765{
1766 v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val));
1767 return c;
1768}
1769
1770inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1771
1772inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
1773{
1774 v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val));
1775 return c;
1776}
1777
1778inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1779inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
1780inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
1781inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
1782
1783inline v_float64x2 v_lut(const double* tab, const int* idx)
1784{
1785 double CV_DECL_ALIGNED(32) elems[2] =
1786 {
1787 tab[idx[0]],
1788 tab[idx[1]]
1789 };
1790 return v_float64x2(msa_ld1q_f64(elems));
1791}
1792
1793inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1794{
1795 return v_float64x2(msa_ld1q_f64(tab + idx[0]));
1796}
1797
1798inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1799{
1800 int CV_DECL_ALIGNED(32) idx[4];
1801 v_store_aligned(idx, idxvec);
1802
1803 return v_float64x2(tab[idx[0]], tab[idx[1]]);
1804}
1805
1806inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1807{
1808 int CV_DECL_ALIGNED(32) idx[4];
1809 v_store_aligned(idx, idxvec);
1810
1811 v2f64 xy0 = msa_ld1q_f64(tab + idx[0]);
1812 v2f64 xy1 = msa_ld1q_f64(tab + idx[1]);
1813 x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
1814 y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
1815}
1816
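// Note: v_extract_n rotates lane i down to position 0 and reads it with
// get0(); the v_broadcast_element overloads below splat that scalar across
// all lanes with v_setall_*.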
1817template<int i, typename _Tp>
1818inline typename _Tp::lane_type v_extract_n(const _Tp& a)
1819{
1820 return v_rotate_right<i>(a).get0();
1821}
1822
1823template<int i>
1824inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
1825{
1826 return v_setall_u32(v_extract_n<i>(a));
1827}
1828template<int i>
1829inline v_int32x4 v_broadcast_element(const v_int32x4& a)
1830{
1831 return v_setall_s32(v_extract_n<i>(a));
1832}
1833template<int i>
1834inline v_float32x4 v_broadcast_element(const v_float32x4& a)
1835{
1836 return v_setall_f32(v_extract_n<i>(a));
1837}
1838
1839////// FP16 support ///////
1840#if CV_FP16
1841inline v_float32x4 v_load_expand(const hfloat* ptr)
1842{
1843#ifndef msa_ld1_f16
1844 v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
1845#else
1846 v4f16 v = msa_ld1_f16((const __fp16*)ptr);
1847#endif
1848 return v_float32x4(msa_cvt_f32_f16(v));
1849}
1850
1851inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
1852{
1853 v4f16 hv = msa_cvt_f16_f32(v.val);
1854
1855#ifndef msa_st1_f16
1856 msa_st1_s16((short*)ptr, (int16x4_t)hv);
1857#else
1858 msa_st1_f16((__fp16*)ptr, hv);
1859#endif
1860}
1861#else
1862inline v_float32x4 v_load_expand(const hfloat* ptr)
1863{
1864 float buf[4];
1865 for( int i = 0; i < 4; i++ )
1866 buf[i] = (float)ptr[i];
1867 return v_load(buf);
1868}
1869
1870inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
1871{
1872 float buf[4];
1873 v_store(buf, v);
1874 for( int i = 0; i < 4; i++ )
1875 ptr[i] = (hfloat)buf[i];
1876}
1877#endif
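// Without CV_FP16, the half-float conversions above fall back to a scalar
// round trip through a float buffer.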
1878
1879inline void v_cleanup() {}
1880
1881CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
1882
1883//! @endcond
1884
1885}
1886
1887#endif