intrin_msa.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4 
5 #ifndef OPENCV_HAL_INTRIN_MSA_HPP
6 #define OPENCV_HAL_INTRIN_MSA_HPP
7 
8 #include <algorithm>
9 #include "opencv2/core/utility.hpp"
10 
11 namespace cv
12 {
13 
15 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
16 
17 #define CV_SIMD128 1
18 
19 //MSA implements 128-bit wide vector registers shared with the 64-bit wide floating-point unit registers.
20 //MSA and the FPU cannot both be present, unless the FPU has 64-bit floating-point registers.
21 #define CV_SIMD128_64F 1
22 
23 struct v_uint8x16
24 {
25  typedef uchar lane_type;
26  enum { nlanes = 16 };
27 
28  v_uint8x16() {}
29  explicit v_uint8x16(v16u8 v) : val(v) {}
30  v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
31  uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
32  {
33  uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
34  val = msa_ld1q_u8(v);
35  }
36 
37  uchar get0() const
38  {
39  return msa_getq_lane_u8(val, 0);
40  }
41 
42  v16u8 val;
43 };
44 
45 struct v_int8x16
46 {
47  typedef schar lane_type;
48  enum { nlanes = 16 };
49 
50  v_int8x16() {}
51  explicit v_int8x16(v16i8 v) : val(v) {}
52  v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
53  schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
54  {
55  schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
56  val = msa_ld1q_s8(v);
57  }
58 
59  schar get0() const
60  {
61  return msa_getq_lane_s8(val, 0);
62  }
63 
64  v16i8 val;
65 };
66 
67 struct v_uint16x8
68 {
69  typedef ushort lane_type;
70  enum { nlanes = 8 };
71 
72  v_uint16x8() {}
73  explicit v_uint16x8(v8u16 v) : val(v) {}
74  v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
75  {
76  ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
77  val = msa_ld1q_u16(v);
78  }
79 
80  ushort get0() const
81  {
82  return msa_getq_lane_u16(val, 0);
83  }
84 
85  v8u16 val;
86 };
87 
88 struct v_int16x8
89 {
90  typedef short lane_type;
91  enum { nlanes = 8 };
92 
93  v_int16x8() {}
94  explicit v_int16x8(v8i16 v) : val(v) {}
95  v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
96  {
97  short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
98  val = msa_ld1q_s16(v);
99  }
100 
101  short get0() const
102  {
103  return msa_getq_lane_s16(val, 0);
104  }
105 
106  v8i16 val;
107 };
108 
109 struct v_uint32x4
110 {
111  typedef unsigned int lane_type;
112  enum { nlanes = 4 };
113 
114  v_uint32x4() {}
115  explicit v_uint32x4(v4u32 v) : val(v) {}
116  v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3)
117  {
118  unsigned int v[] = {v0, v1, v2, v3};
119  val = msa_ld1q_u32(v);
120  }
121 
122  unsigned int get0() const
123  {
124  return msa_getq_lane_u32(val, 0);
125  }
126 
127  v4u32 val;
128 };
129 
130 struct v_int32x4
131 {
132  typedef int lane_type;
133  enum { nlanes = 4 };
134 
135  v_int32x4() {}
136  explicit v_int32x4(v4i32 v) : val(v) {}
137  v_int32x4(int v0, int v1, int v2, int v3)
138  {
139  int v[] = {v0, v1, v2, v3};
140  val = msa_ld1q_s32(v);
141  }
142 
143  int get0() const
144  {
145  return msa_getq_lane_s32(val, 0);
146  }
147 
148  v4i32 val;
149 };
150 
151 struct v_float32x4
152 {
153  typedef float lane_type;
154  enum { nlanes = 4 };
155 
156  v_float32x4() {}
157  explicit v_float32x4(v4f32 v) : val(v) {}
158  v_float32x4(float v0, float v1, float v2, float v3)
159  {
160  float v[] = {v0, v1, v2, v3};
161  val = msa_ld1q_f32(v);
162  }
163 
164  float get0() const
165  {
166  return msa_getq_lane_f32(val, 0);
167  }
168 
169  v4f32 val;
170 };
171 
172 struct v_uint64x2
173 {
174  typedef uint64 lane_type;
175  enum { nlanes = 2 };
176 
177  v_uint64x2() {}
178  explicit v_uint64x2(v2u64 v) : val(v) {}
179  v_uint64x2(uint64 v0, uint64 v1)
180  {
181  uint64 v[] = {v0, v1};
182  val = msa_ld1q_u64(v);
183  }
184 
185  uint64 get0() const
186  {
187  return msa_getq_lane_u64(val, 0);
188  }
189 
190  v2u64 val;
191 };
192 
193 struct v_int64x2
194 {
195  typedef int64 lane_type;
196  enum { nlanes = 2 };
197 
198  v_int64x2() {}
199  explicit v_int64x2(v2i64 v) : val(v) {}
200  v_int64x2(int64 v0, int64 v1)
201  {
202  int64 v[] = {v0, v1};
203  val = msa_ld1q_s64(v);
204  }
205 
206  int64 get0() const
207  {
208  return msa_getq_lane_s64(val, 0);
209  }
210 
211  v2i64 val;
212 };
213 
214 struct v_float64x2
215 {
216  typedef double lane_type;
217  enum { nlanes = 2 };
218 
219  v_float64x2() {}
220  explicit v_float64x2(v2f64 v) : val(v) {}
221  v_float64x2(double v0, double v1)
222  {
223  double v[] = {v0, v1};
224  val = msa_ld1q_f64(v);
225  }
226 
227  double get0() const
228  {
229  return msa_getq_lane_f64(val, 0);
230  }
231 
232  v2f64 val;
233 };
234 
235 #define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \
236 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
237 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
238 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
239 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
240 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
241 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \
242 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \
243 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \
244 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \
245 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \
246 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \
247 inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); }
248 
249 OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8)
250 OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8)
251 OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16)
252 OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16)
253 OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32)
254 OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32)
255 OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64)
256 OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64)
257 OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32)
258 OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64)
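// --- Illustrative sketch (editorial addition, not part of the original header). The INIT
// macro above generates v_setzero_*/v_setall_* constructors plus bit-preserving reinterpret
// casts between all register types. The helper name below is hypothetical.
inline v_uint32x4 example_bits_of_one()
{
    v_float32x4 ones = v_setall_f32(1.0f);   // {1.0f, 1.0f, 1.0f, 1.0f}
    return v_reinterpret_as_u32(ones);       // each lane holds the raw bit pattern 0x3F800000
}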
259 
260 #define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \
261 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
262 { \
263  return _Tpvec(mov(a.val, b.val)); \
264 } \
265 template<int n> inline \
266 _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
267 { \
268  return _Tpvec(rshr(a.val, b.val, n)); \
269 }
270 
271 OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_uint16x8, pack, msa_qpack_u16, msa_qrpackr_u16)
272 OPENCV_HAL_IMPL_MSA_PACK(v_int8x16, v_int16x8, pack, msa_qpack_s16, msa_qrpackr_s16)
273 OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_uint32x4, pack, msa_qpack_u32, msa_qrpackr_u32)
274 OPENCV_HAL_IMPL_MSA_PACK(v_int16x8, v_int32x4, pack, msa_qpack_s32, msa_qrpackr_s32)
275 OPENCV_HAL_IMPL_MSA_PACK(v_uint32x4, v_uint64x2, pack, msa_pack_u64, msa_rpackr_u64)
276 OPENCV_HAL_IMPL_MSA_PACK(v_int32x4, v_int64x2, pack, msa_pack_s64, msa_rpackr_s64)
277 OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_int16x8, pack_u, msa_qpacku_s16, msa_qrpackru_s16)
278 OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_int32x4, pack_u, msa_qpacku_s32, msa_qrpackru_s32)
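// --- Illustrative sketch (editorial addition). v_pack narrows with saturation
// (msa_qpack_u16 here), while v_rshr_pack<n> applies a rounding right shift by n before
// narrowing. The helper name is hypothetical.
inline v_uint8x16 example_narrow_u16(const v_uint16x8& lo, const v_uint16x8& hi)
{
    return v_pack(lo, hi);   // 16-bit lanes above 255 clamp to 255
}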
279 
280 #define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \
281 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
282 { \
283  hreg a1 = mov(a.val); \
284  msa_st1_##suffix(ptr, a1); \
285 } \
286 template<int n> inline \
287 void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
288 { \
289  hreg a1 = rshr(a.val, n); \
290  msa_st1_##suffix(ptr, a1); \
291 }
292 
293 OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16)
294 OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16)
295 OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32)
296 OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32)
297 OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64)
298 OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64)
299 OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16)
300 OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32)
301 
302 // pack boolean
303 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
304 {
305  return v_uint8x16(msa_pack_u16(a.val, b.val));
306 }
307 
308 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
309  const v_uint32x4& c, const v_uint32x4& d)
310 {
311  return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val)));
312 }
313 
314 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
315  const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
316  const v_uint64x2& g, const v_uint64x2& h)
317 {
318  v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val));
319  v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val));
320  return v_uint8x16(msa_pack_u16(abcd, efgh));
321 }
322 
323 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
324  const v_float32x4& m1, const v_float32x4& m2,
325  const v_float32x4& m3)
326 {
327  v4f32 v0 = v.val;
328  v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
329  res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
330  res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
331  res = msa_mlaq_lane_f32(res, m3.val, v0, 3);
332  return v_float32x4(res);
333 }
334 
335 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
336  const v_float32x4& m1, const v_float32x4& m2,
337  const v_float32x4& a)
338 {
339  v4f32 v0 = v.val;
340  v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
341  res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
342  res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
343  res = msa_addq_f32(res, a.val);
344  return v_float32x4(res);
345 }
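// --- Illustrative sketch (editorial addition). v_matmul/v_matmuladd treat m0..m3 as matrix
// columns: the result is m0*v[0] + m1*v[1] + m2*v[2] plus either m3*v[3] or the additive
// term a. A hypothetical affine point transform:
inline v_float32x4 example_affine(const v_float32x4& p, const v_float32x4& col0,
                                  const v_float32x4& col1, const v_float32x4& col2,
                                  const v_float32x4& translation)
{
    return v_matmuladd(p, col0, col1, col2, translation);
}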
346 
347 #define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \
348 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
349 { \
350  return _Tpvec(intrin(a.val, b.val)); \
351 } \
352 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
353 { \
354  a.val = intrin(a.val, b.val); \
355  return a; \
356 }
357 
358 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8)
359 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
360 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
361 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
362 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
363 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
364 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
365 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
366 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32)
367 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
368 OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
369 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
370 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
371 OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
372 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
373 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
374 OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
375 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64)
376 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
377 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
378 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
379 OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
380 OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
381 OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
382 OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
383 OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
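// --- Illustrative sketch (editorial addition). The 8- and 16-bit operators map to the
// saturating MSA intrinsics, so overflow clamps instead of wrapping (the wrapping variants
// v_add_wrap/v_sub_wrap appear further below). The helper name is hypothetical.
inline v_uint8x16 example_saturating_add()
{
    v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
    return a + b;   // msa_qaddq_u8: every lane becomes 255, not (200 + 100) % 256
}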
384 
385 // saturating multiply 8-bit, 16-bit
386 #define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \
387 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
388 { \
389  _Tpwvec c, d; \
390  v_mul_expand(a, b, c, d); \
391  return v_pack(c, d); \
392 } \
393 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
394 {a = a * b; return a; }
395 
396 OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8)
397 OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8)
398 OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int16x8, v_int32x4)
399 OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint16x8, v_uint32x4)
400 
401 // Multiply and expand
402 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
403  v_int16x8& c, v_int16x8& d)
404 {
405  v16i8 a_lo, a_hi, b_lo, b_hi;
406 
407  ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi);
408  ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi);
409  c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo));
410  d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi));
411 }
412 
413 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
414  v_uint16x8& c, v_uint16x8& d)
415 {
416  v16u8 a_lo, a_hi, b_lo, b_hi;
417 
418  ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi);
419  ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi);
420  c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo));
421  d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi));
422 }
423 
424 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
425  v_int32x4& c, v_int32x4& d)
426 {
427  v8i16 a_lo, a_hi, b_lo, b_hi;
428 
429  ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
430  ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
431  c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo));
432  d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi));
433 }
434 
435 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
436  v_uint32x4& c, v_uint32x4& d)
437 {
438  v8u16 a_lo, a_hi, b_lo, b_hi;
439 
440  ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
441  ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
442  c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo));
443  d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi));
444 }
445 
446 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
447  v_uint64x2& c, v_uint64x2& d)
448 {
449  v4u32 a_lo, a_hi, b_lo, b_hi;
450 
451  ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi);
452  ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi);
453  c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo));
454  d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi));
455 }
456 
457 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
458 {
459  v8i16 a_lo, a_hi, b_lo, b_hi;
460 
461  ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
462  ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
463 
464  return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)),
465  msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16));
466 }
467 
468 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
469 {
470  v8u16 a_lo, a_hi, b_lo, b_hi;
471 
472  ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
473  ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
474 
475  return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)),
476  msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16));
477 }
478 
480 
481 // 16 >> 32
482 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
483 { return v_int32x4(msa_dotp_s_w(a.val, b.val)); }
484 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
485 { return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); }
486 
487 // 32 >> 64
488 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
489 { return v_int64x2(msa_dotp_s_d(a.val, b.val)); }
490 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
491 { return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); }
492 
493 // 8 >> 32
494 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
495 {
496  v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
497  v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
498  v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
499  v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
500  v4u32 prod = msa_dotp_u_w(even_a, even_b);
501  return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
502 }
503 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
504 {
505  v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
506  v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
507  v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
508  v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
509  v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b);
510  return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
511 }
512 
513 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
514 {
515  v8i16 prod = msa_dotp_s_h(a.val, b.val);
516  return v_int32x4(msa_hadd_s32(prod, prod));
517 }
518 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
519  const v_int32x4& c)
520 { return v_dotprod_expand(a, b) + c; }
521 
522 // 16 >> 64
523 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
524 {
525  v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
526  v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
527  v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
528  v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
529  v2u64 prod = msa_dotp_u_d(even_a, even_b);
530  return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
531 }
532 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
533  const v_uint64x2& c)
534 {
535  v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
536  v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
537  v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
538  v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
539  v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b);
540  return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
541 }
542 
543 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
544 {
545  v4i32 prod = msa_dotp_s_w(a.val, b.val);
546  return v_int64x2(msa_hadd_s64(prod, prod));
547 }
548 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
549 { return v_dotprod_expand(a, b) + c; }
550 
551 // 32 >> 64f
552 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
553 { return v_cvt_f64(v_dotprod(a, b)); }
554 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
555 { return v_dotprod_expand(a, b) + c; }
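// --- Illustrative sketch (editorial addition). v_dotprod multiplies adjacent lane pairs and
// adds each pair into the next wider type. The helper name is hypothetical.
inline v_int32x4 example_dotprod()
{
    v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
    return v_dotprod(a, b);   // each 32-bit lane = 3*4 + 3*4 = 24
}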
556 
557 
559 
560 // 16 >> 32
561 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
562 { return v_dotprod(a, b); }
563 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
564 { return v_dotprod(a, b, c); }
565 
566 // 32 >> 64
567 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
568 { return v_dotprod(a, b); }
569 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
570 { return v_dotprod(a, b, c); }
571 
572 // 8 >> 32
573 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
574 { return v_dotprod_expand(a, b); }
575 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
576 { return v_dotprod_expand(a, b, c); }
577 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
578 { return v_dotprod_expand(a, b); }
579 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
580 { return v_dotprod_expand(a, b, c); }
581 
582 // 16 >> 64
583 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
584 { return v_dotprod_expand(a, b); }
585 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
586 { return v_dotprod_expand(a, b, c); }
587 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
588 { return v_dotprod_expand(a, b); }
589 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
590 { return v_dotprod_expand(a, b, c); }
591 
592 // 32 >> 64f
593 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
594 { return v_dotprod_expand(a, b); }
595 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
596 { return v_dotprod_expand(a, b, c); }
597 
598 #define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \
599 OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \
600 OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \
601 OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \
602 inline _Tpvec operator ~ (const _Tpvec& a) \
603 { \
604  return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
605 }
606 
607 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8)
608 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8)
609 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16)
610 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16)
611 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32)
612 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32)
613 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
614 OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
615 
616 #define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \
617 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
618 { \
619  return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
620 } \
621 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
622 { \
623  a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
624  return a; \
625 }
626 
627 OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
628 OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
629 OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
630 
631 inline v_float32x4 operator ~ (const v_float32x4& a)
632 {
633  return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
634 }
635 
636 /* v_abs */
637 #define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
638 inline _Tpuvec v_abs(const _Tpsvec& a) \
639 { \
640  return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \
641 }
642 
643 OPENCV_HAL_IMPL_MSA_ABS(v_uint8x16, v_int8x16, u8, s8)
644 OPENCV_HAL_IMPL_MSA_ABS(v_uint16x8, v_int16x8, u16, s16)
645 OPENCV_HAL_IMPL_MSA_ABS(v_uint32x4, v_int32x4, u32, s32)
646 
647 /* v_abs(float), v_sqrt, v_invsqrt */
648 #define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \
649 inline _Tpvec func(const _Tpvec& a) \
650 { \
651  return _Tpvec(intrin(a.val)); \
652 }
653 
654 OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32)
655 OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64)
656 OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32)
657 OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_invsqrt, msa_rsqrtq_f32)
658 OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
659 OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
660 
661 #define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \
662 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
663 { \
664  return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
665 } \
666 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
667 { \
668  a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
669  return a; \
670 }
671 
672 OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
673 OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
674 OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
675 
676 inline v_float64x2 operator ~ (const v_float64x2& a)
677 {
678  return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
679 }
680 
681 // TODO: exp, log, sin, cos
682 
683 #define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \
684 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
685 { \
686  return _Tpvec(intrin(a.val, b.val)); \
687 }
688 
689 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8)
690 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8)
691 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8)
692 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8)
693 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16)
694 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16)
695 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16)
696 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16)
697 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32)
698 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32)
699 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32)
700 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32)
701 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32)
702 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32)
703 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
704 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
705 
706 #define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \
707 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
708 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
709 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
710 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
711 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
712 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
713 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
714 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
715 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
716 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
717 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
718 { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
719 
720 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8)
721 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8)
722 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16)
723 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16)
724 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32)
725 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32)
726 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32)
727 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64)
728 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64)
729 OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64)
730 
731 inline v_float32x4 v_not_nan(const v_float32x4& a)
732 { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); }
733 inline v_float64x2 v_not_nan(const v_float64x2& a)
734 { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); }
735 
736 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8)
737 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8)
738 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16)
739 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16)
740 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8)
741 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8)
742 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16)
743 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16)
744 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8)
745 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8)
746 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16)
747 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16)
748 
749 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_absdiff, msa_abdq_u8)
750 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_absdiff, msa_abdq_u16)
751 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_absdiff, msa_abdq_u32)
752 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_absdiff, msa_abdq_f32)
753 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_absdiff, msa_abdq_f64)
754 
755 
756 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_absdiffs, msa_qabdq_s8)
757 OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_absdiffs, msa_qabdq_s16)
758 
759 #define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \
760 inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
761 { \
762  return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \
763 }
764 
765 OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int8x16, v_uint8x16, v16u8, v_absdiff, msa_abdq_s8)
766 OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int16x8, v_uint16x8, v8u16, v_absdiff, msa_abdq_s16)
767 OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int32x4, v_uint32x4, v4u32, v_absdiff, msa_abdq_s32)
768 
769 /* v_magnitude, v_sqr_magnitude, v_fma, v_muladd */
770 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
771 {
772  v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
773  return v_sqrt(x);
774 }
775 
776 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
777 {
778  return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
779 }
780 
781 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
782 {
783  return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));
784 }
785 
786 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
787 {
788  return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));
789 }
790 
791 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
792 {
793  return v_fma(a, b, c);
794 }
795 
796 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
797 {
798  return v_fma(a, b, c);
799 }
800 
801 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
802 {
803  v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
804  return v_sqrt(x);
805 }
806 
807 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
808 {
809  return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
810 }
811 
812 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
813 {
814  return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val));
815 }
816 
817 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
818 {
819  return v_fma(a, b, c);
820 }
821 
822 // trade efficiency for convenience
823 #define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
824 inline _Tpvec operator << (const _Tpvec& a, int n) \
825 { return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
826 inline _Tpvec operator >> (const _Tpvec& a, int n) \
827 { return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \
828 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
829 { return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \
830 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
831 { return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \
832 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
833 { return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); }
834 
835 OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8)
836 OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8)
837 OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16)
838 OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16)
839 OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32)
840 OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32)
841 OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64)
842 OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64)
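// --- Illustrative sketch (editorial addition). The template forms v_shl/v_shr/v_rshr take
// the shift amount as a compile-time constant, while the operators accept a run-time count
// via a splat. The helper name is hypothetical.
inline v_uint16x8 example_shifts()
{
    v_uint16x8 a = v_setall_u16(0x00FF);
    v_uint16x8 hi = v_shl<4>(a);   // 0x0FF0 in every lane
    v_uint16x8 lo = a >> 4;        // 0x000F in every lane
    return hi | lo;                // 0x0FFF in every lane
}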
843 
844 /* v_rotate_right, v_rotate_left */
845 #define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \
846 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
847 { \
848  return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \
849 } \
850 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
851 { \
852  return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
853 } \
854 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
855 { \
856  return a; \
857 } \
858 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
859 { \
860  return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \
861 } \
862 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
863 { \
864  return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
865 } \
866 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
867 { \
868  CV_UNUSED(b); \
869  return a; \
870 }
871 
872 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8)
873 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8)
874 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16)
875 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16)
876 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32)
877 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32)
878 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32)
879 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64)
880 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64)
881 OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64)
882 
883 #define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
884 inline _Tpvec v_load(const _Tp* ptr) \
885 { return _Tpvec(msa_ld1q_##suffix(ptr)); } \
886 inline _Tpvec v_load_aligned(const _Tp* ptr) \
887 { return _Tpvec(msa_ld1q_##suffix(ptr)); } \
888 inline _Tpvec v_load_low(const _Tp* ptr) \
889 { return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \
890 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
891 { return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \
892 inline void v_store(_Tp* ptr, const _Tpvec& a) \
893 { msa_st1q_##suffix(ptr, a.val); } \
894 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
895 { msa_st1q_##suffix(ptr, a.val); } \
896 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
897 { msa_st1q_##suffix(ptr, a.val); } \
898 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
899 { msa_st1q_##suffix(ptr, a.val); } \
900 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
901 { \
902  int n = _Tpvec::nlanes; \
903  for( int i = 0; i < (n/2); i++ ) \
904  ptr[i] = a.val[i]; \
905 } \
906 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
907 { \
908  int n = _Tpvec::nlanes; \
909  for( int i = 0; i < (n/2); i++ ) \
910  ptr[i] = a.val[i+(n/2)]; \
911 }
912 
913 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8)
914 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8)
915 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16)
916 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16)
917 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32)
918 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32)
919 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64)
920 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64)
921 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32)
922 OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64)
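// --- Illustrative sketch (editorial addition). A typical processing loop loads whole
// registers, works lane-wise, stores the result, and finishes the remainder in scalar code.
// The helper name is hypothetical.
inline void example_fma_rows(const float* a, const float* b, float* dst, int n)
{
    int i = 0;
    for (; i <= n - v_float32x4::nlanes; i += v_float32x4::nlanes)
        v_store(dst + i, v_muladd(v_load(a + i), v_load(b + i), v_setall_f32(1.0f)));
    for (; i < n; i++)
        dst[i] = a[i] * b[i] + 1.0f;   // scalar tail
}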
923 
924 
925 
926 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
927 {
928  v_uint8x16 c = v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val));
929  return c;
930 }
931 
932 inline v_int8x16 v_reverse(const v_int8x16 &a)
933 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
934 
935 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
936 {
937  v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val));
938  return c;
939 }
940 
941 inline v_int16x8 v_reverse(const v_int16x8 &a)
942 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
943 
944 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
945 {
946  v_uint32x4 c;
947  c.val[0] = a.val[3];
948  c.val[1] = a.val[2];
949  c.val[2] = a.val[1];
950  c.val[3] = a.val[0];
951  return c;
952 }
953 
954 inline v_int32x4 v_reverse(const v_int32x4 &a)
955 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
956 
957 inline v_float32x4 v_reverse(const v_float32x4 &a)
958 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
959 
960 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
961 {
962  v_uint64x2 c;
963  c.val[0] = a.val[1];
964  c.val[1] = a.val[0];
965  return c;
966 }
967 
968 inline v_int64x2 v_reverse(const v_int64x2 &a)
969 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
970 
971 inline v_float64x2 v_reverse(const v_float64x2 &a)
972 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
973 
974 
975 #define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \
976 inline unsigned short v_reduce_##func(const v_uint16x8& a) \
977 { \
978  v8u16 a_lo, a_hi; \
979  ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \
980  v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \
981  v4u32 b_lo, b_hi; \
982  ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \
983  v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \
984  return (unsigned short)cfunc(c[0], c[1]); \
985 }
986 
987 OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, std::max)
988 OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, std::min)
989 
990 #define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \
991 inline short v_reduce_##func(const v_int16x8& a) \
992 { \
993  v8i16 a_lo, a_hi; \
994  ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \
995  v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \
996  v4i32 b_lo, b_hi; \
997  ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \
998  v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \
999  return (short)cfunc(c[0], c[1]); \
1000 }
1001 
1002 OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, std::max)
1003 OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, std::min)
1004 
1005 #define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \
1006 inline scalartype v_reduce_##func(const _Tpvec& a) \
1007 { \
1008  return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \
1009 }
1010 
1011 OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
1012 OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
1013 OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, max, std::max)
1014 OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, min, std::min)
1015 OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, max, std::max)
1016 OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, min, std::min)
1017 
1018 
1019 #define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) \
1020 inline scalartype v_reduce_##func(const _Tpvec& a) \
1021 { \
1022  _Tpvec2 a1, a2; \
1023  v_expand(a, a1, a2); \
1024  return (scalartype)v_reduce_##func(v_##func(a1, a2)); \
1025 }
1026 
1027 OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, min)
1028 OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, max)
1029 OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, min)
1030 OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, max)
1031 
1032 
1033 
1034 #define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \
1035 inline scalartype v_reduce_sum(const _Tpvec& a) \
1036 { \
1037  return (scalartype)msa_sum_##suffix(a.val); \
1038 }
1039 
1040 OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned short, u8)
1041 OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, short, s8)
1042 OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned, u16)
1043 OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, int, s16)
1044 OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, uint64_t, u32)
1045 OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int64_t, s32)
1046 OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32)
1047 
1048 inline uint64 v_reduce_sum(const v_uint64x2& a)
1049 { return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); }
1050 inline int64 v_reduce_sum(const v_int64x2& a)
1051 { return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); }
1052 inline double v_reduce_sum(const v_float64x2& a)
1053 {
1054  return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1);
1055 }
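// --- Illustrative sketch (editorial addition). v_reduce_sum folds all lanes of one register
// into a scalar. A hypothetical helper summing 16 consecutive floats:
inline float example_sum16(const float* ptr)
{
    v_float32x4 s = (v_load(ptr) + v_load(ptr + 4)) + (v_load(ptr + 8) + v_load(ptr + 12));
    return v_reduce_sum(s);
}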
1056 
1057 /* v_reduce_sum4, v_reduce_sad */
1058 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1059  const v_float32x4& c, const v_float32x4& d)
1060 {
1061  v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))),
1062  MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val)))); // a0+a1 b0+b1 a2+a3 b2+b3
1063  v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))),
1064  MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val)))); // c0+c1 d0+d1 c2+c3 d2+d3
1065 
1066  return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))),
1067  MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0)))));
1068 }
1069 
1070 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1071 {
1072  v16u8 t0 = msa_abdq_u8(a.val, b.val);
1073  v8u16 t1 = msa_paddlq_u8(t0);
1074  v4u32 t2 = msa_paddlq_u16(t1);
1075  return msa_sum_u32(t2);
1076 }
1077 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1078 {
1079  v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val));
1080  v8u16 t1 = msa_paddlq_u8(t0);
1081  v4u32 t2 = msa_paddlq_u16(t1);
1082  return msa_sum_u32(t2);
1083 }
1084 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1085 {
1086  v8u16 t0 = msa_abdq_u16(a.val, b.val);
1087  v4u32 t1 = msa_paddlq_u16(t0);
1088  return msa_sum_u32(t1);
1089 }
1090 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1091 {
1092  v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val));
1093  v4u32 t1 = msa_paddlq_u16(t0);
1094  return msa_sum_u32(t1);
1095 }
1096 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1097 {
1098  v4u32 t0 = msa_abdq_u32(a.val, b.val);
1099  return msa_sum_u32(t0);
1100 }
1101 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1102 {
1103  v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val));
1104  return msa_sum_u32(t0);
1105 }
1106 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1107 {
1108  v4f32 t0 = msa_abdq_f32(a.val, b.val);
1109  return msa_sum_f32(t0);
1110 }
1111 
1112 /* v_popcount */
1113 #define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \
1114 inline v_uint8x16 v_popcount(const _Tpvec& a) \
1115 { \
1116  v16u8 t = MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \
1117  return v_uint8x16(t); \
1118 }
1119 OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16)
1120 OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16)
1121 
1122 #define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \
1123 inline v_uint16x8 v_popcount(const _Tpvec& a) \
1124 { \
1125  v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \
1126  return v_uint16x8(t); \
1127 }
1128 OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8)
1129 OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8)
1130 
1131 #define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \
1132 inline v_uint32x4 v_popcount(const _Tpvec& a) \
1133 { \
1134  v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \
1135  return v_uint32x4(t); \
1136 }
1137 OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4)
1138 OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4)
1139 
1140 #define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \
1141 inline v_uint64x2 v_popcount(const _Tpvec& a) \
1142 { \
1143  v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \
1144  return v_uint64x2(t); \
1145 }
1146 OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2)
1147 OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2)
1148 
1149 inline int v_signmask(const v_uint8x16& a)
1150 {
1151  v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100));
1152  v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0));
1153  v8u16 v1 = msa_paddlq_u8(v0);
1154  v4u32 v2 = msa_paddlq_u16(v1);
1155  v2u64 v3 = msa_paddlq_u32(v2);
1156  return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8);
1157 }
1158 inline int v_signmask(const v_int8x16& a)
1159 { return v_signmask(v_reinterpret_as_u8(a)); }
1160 
1161 inline int v_signmask(const v_uint16x8& a)
1162 {
1163  v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000));
1164  v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0));
1165  v4u32 v1 = msa_paddlq_u16(v0);
1166  v2u64 v2 = msa_paddlq_u32(v1);
1167  return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4);
1168 }
1169 inline int v_signmask(const v_int16x8& a)
1170 { return v_signmask(v_reinterpret_as_u16(a)); }
1171 
1172 inline int v_signmask(const v_uint32x4& a)
1173 {
1174  v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000));
1175  v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0));
1176  v2u64 v1 = msa_paddlq_u32(v0);
1177  return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2);
1178 }
1179 inline int v_signmask(const v_int32x4& a)
1180 { return v_signmask(v_reinterpret_as_u32(a)); }
1181 inline int v_signmask(const v_float32x4& a)
1182 { return v_signmask(v_reinterpret_as_u32(a)); }
1183 
1184 inline int v_signmask(const v_uint64x2& a)
1185 {
1186  v2u64 v0 = msa_shrq_n_u64(a.val, 63);
1187  return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1);
1188 }
1189 inline int v_signmask(const v_int64x2& a)
1190 { return v_signmask(v_reinterpret_as_u64(a)); }
1191 inline int v_signmask(const v_float64x2& a)
1192 { return v_signmask(v_reinterpret_as_u64(a)); }
1193 
1194 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
1195 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
1196 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
1197 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
1198 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
1199 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
1200 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
1201 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
1202 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1203 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
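// --- Illustrative sketch (editorial addition). v_signmask packs the sign bit of lane i into
// bit i of an integer, and v_scan_forward returns the index of the first set lane. The
// helper name is hypothetical.
inline bool example_any_negative(const v_float32x4& a)
{
    return v_signmask(a) != 0;   // non-zero mask => some lane has its sign bit set
}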
1204 
1205 #define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \
1206 inline bool v_check_all(const v_##_Tpvec& a) \
1207 { \
1208  _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \
1209  v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
1210  return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \
1211 } \
1212 inline bool v_check_any(const v_##_Tpvec& a) \
1213 { \
1214  _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \
1215  v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
1216  return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \
1217 }
1218 
1219 OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7)
1220 OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15)
1221 OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31)
1222 OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63)
1223 
1224 inline bool v_check_all(const v_int8x16& a)
1225 { return v_check_all(v_reinterpret_as_u8(a)); }
1226 inline bool v_check_all(const v_int16x8& a)
1227 { return v_check_all(v_reinterpret_as_u16(a)); }
1228 inline bool v_check_all(const v_int32x4& a)
1229 { return v_check_all(v_reinterpret_as_u32(a)); }
1230 inline bool v_check_all(const v_float32x4& a)
1231 { return v_check_all(v_reinterpret_as_u32(a)); }
1232 
1233 inline bool v_check_any(const v_int8x16& a)
1234 { return v_check_any(v_reinterpret_as_u8(a)); }
1235 inline bool v_check_any(const v_int16x8& a)
1236 { return v_check_any(v_reinterpret_as_u16(a)); }
1237 inline bool v_check_any(const v_int32x4& a)
1238 { return v_check_any(v_reinterpret_as_u32(a)); }
1239 inline bool v_check_any(const v_float32x4& a)
1240 { return v_check_any(v_reinterpret_as_u32(a)); }
1241 
1242 inline bool v_check_all(const v_int64x2& a)
1243 { return v_check_all(v_reinterpret_as_u64(a)); }
1244 inline bool v_check_all(const v_float64x2& a)
1245 { return v_check_all(v_reinterpret_as_u64(a)); }
1246 inline bool v_check_any(const v_int64x2& a)
1247 { return v_check_any(v_reinterpret_as_u64(a)); }
1248 inline bool v_check_any(const v_float64x2& a)
1249 { return v_check_any(v_reinterpret_as_u64(a)); }
1250 
1251 /* v_select */
1252 #define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \
1253 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1254 { \
1255  return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \
1256  MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \
1257 }
1258 
1259 OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8)
1260 OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8)
1261 OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8)
1262 OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8)
1263 OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8)
1264 OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8)
1265 OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8)
1266 OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8)
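// --- Illustrative sketch (editorial addition). v_select(mask, a, b) picks a in lanes where
// the comparison mask is all-ones and b elsewhere. A hypothetical helper clamping negative
// lanes to zero:
inline v_float32x4 example_relu(const v_float32x4& a)
{
    v_float32x4 zero = v_setzero_f32();
    return v_select(a < zero, zero, a);
}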
1267 
1268 #define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \
1269 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1270 { \
1271  _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1272  _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1273  b0.val = msa_paddlq_##suffix(a_lo); \
1274  b1.val = msa_paddlq_##suffix(a_hi); \
1275 } \
1276 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1277 { \
1278  _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1279  return _Tpwvec(msa_paddlq_##suffix(a_lo)); \
1280 } \
1281 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1282 { \
1283  _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
1284  return _Tpwvec(msa_paddlq_##suffix(a_hi)); \
1285 } \
1286 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1287 { \
1288  return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \
1289 }
1290 
1291 OPENCV_HAL_IMPL_MSA_EXPAND(v_uint8x16, v_uint16x8, uchar, u8, s8, v16u8, v16i8)
1292 OPENCV_HAL_IMPL_MSA_EXPAND(v_int8x16, v_int16x8, schar, s8, s8, v16i8, v16i8)
1293 OPENCV_HAL_IMPL_MSA_EXPAND(v_uint16x8, v_uint32x4, ushort, u16, s16, v8u16, v8i16)
1294 OPENCV_HAL_IMPL_MSA_EXPAND(v_int16x8, v_int32x4, short, s16, s16, v8i16, v8i16)
1295 OPENCV_HAL_IMPL_MSA_EXPAND(v_uint32x4, v_uint64x2, uint, u32, s32, v4u32, v4i32)
1296 OPENCV_HAL_IMPL_MSA_EXPAND(v_int32x4, v_int64x2, int, s32, s32, v4i32, v4i32)
1297 
1298 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1299 {
1300  return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]});
1301 }
1302 
1303 inline v_int32x4 v_load_expand_q(const schar* ptr)
1304 {
1305  return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]});
1306 }
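// --- Illustrative sketch (editorial addition). v_expand widens one register into two
// registers of the next lane width; chaining it twice takes u8 lanes up to u32 accumulators.
// The helper name is hypothetical.
inline v_uint32x4 example_widen_u8(const uchar* ptr)
{
    v_uint16x8 lo, hi;
    v_expand(v_load(ptr), lo, hi);   // 16 x u8 -> 2 x (8 x u16)
    v_uint32x4 s0, s1, s2, s3;
    v_expand(lo, s0, s1);            // 8 x u16 -> 2 x (4 x u32)
    v_expand(hi, s2, s3);
    return (s0 + s1) + (s2 + s3);    // partial sums covering all 16 input bytes
}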
1307 
1308 /* v_zip, v_combine_low, v_combine_high, v_recombine */
1309 #define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \
1310 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1311 { \
1312  b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1313  b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1314 } \
1315 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1316 { \
1317  return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
1318 } \
1319 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1320 { \
1321  return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
1322 } \
1323 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1324 { \
1325  c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
1326  d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
1327 }
1328 
1329 OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8)
1330 OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8)
1331 OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16)
1332 OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16)
1333 OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32)
1334 OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32)
1335 OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32)
1336 OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64)
1337 
1338 /* v_extract */
1339 #define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \
1340 template <int s> \
1341 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
1342 { \
1343  return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \
1344 }
1345 
1346 OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8)
1347 OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8)
1348 OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16)
1349 OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16)
1350 OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, v4i32, s32)
1351 OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32)
1352 OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64)
1353 OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64)
1354 OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32)
1355 OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64)
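// v_extract<s> concatenates the top (nlanes - s) lanes of a with the first s
// lanes of b, an "align right"-style shift across two registers. Illustrative sketch:
//   v_int32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
//   v_extract<1>(a, b);      // {1, 2, 3, 4}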
1356 
1357 /* v_round, v_floor, v_ceil, v_trunc */
1358 inline v_int32x4 v_round(const v_float32x4& a)
1359 {
1360  return v_int32x4(msa_cvttintq_s32_f32(a.val));
1361 }
1362 
1363 inline v_int32x4 v_floor(const v_float32x4& a)
1364 {
1365  v4i32 a1 = msa_cvttintq_s32_f32(a.val);
1366  return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val))));
1367 }
1368 
1369 inline v_int32x4 v_ceil(const v_float32x4& a)
1370 {
1371  v4i32 a1 = msa_cvttintq_s32_f32(a.val);
1372  return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1)))));
1373 }
1374 
1375 inline v_int32x4 v_trunc(const v_float32x4& a)
1376 {
1377  return v_int32x4(msa_cvttruncq_s32_f32(a.val));
1378 }
1379 
1380 inline v_int32x4 v_round(const v_float64x2& a)
1381 {
1382  return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0)));
1383 }
1384 
1385 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1386 {
1387  return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val)));
1388 }
1389 
1390 inline v_int32x4 v_floor(const v_float64x2& a)
1391 {
1392  v2f64 a1 = msa_cvtrintq_f64(a.val);
1393  return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0)));
1394 }
1395 
1396 inline v_int32x4 v_ceil(const v_float64x2& a)
1397 {
1398  v2f64 a1 = msa_cvtrintq_f64(a.val);
1399  return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0)));
1400 }
1401 
1402 inline v_int32x4 v_trunc(const v_float64x2& a)
1403 {
1404  return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0)));
1405 }
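// Rounding family, assuming the default round-to-nearest conversion mode
// (illustrative values):
//   v_float32x4 f(1.4f, -1.4f, 2.7f, -2.7f);
//   v_round(f);   // {1, -1, 3, -3}
//   v_floor(f);   // {1, -2, 2, -3}
//   v_ceil(f);    // {2, -1, 3, -2}
//   v_trunc(f);   // {1, -1, 2, -2}
// The v_float64x2 overloads convert two lanes; the single-argument forms set
// the upper two result lanes to zero.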
1406 
1407 #define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \
1408 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1409  const _Tpvec& a2, const _Tpvec& a3, \
1410  _Tpvec& b0, _Tpvec& b1, \
1411  _Tpvec& b2, _Tpvec& b3) \
1412 { \
1413  _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1414  _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
1415  _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
1416  _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
1417  b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
1418  b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
1419  b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
1420  b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
1421 }
1422 
1423 OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32)
1424 OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32)
1425 OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32)
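// v_transpose4x4 treats a0..a3 as the rows of a 4x4 matrix and writes its
// columns to b0..b3. Illustrative sketch:
//   a0 = {0, 1, 2, 3}, a1 = {4, 5, 6, 7}, a2 = {8, 9, 10, 11}, a3 = {12, 13, 14, 15}
//   => b0 = {0, 4, 8, 12}, b1 = {1, 5, 9, 13}, b2 = {2, 6, 10, 14}, b3 = {3, 7, 11, 15}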
1426 
1427 #define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \
1428 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
1429 { \
1430  msa_ld2q_##suffix(ptr, &a.val, &b.val); \
1431 } \
1432 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
1433 { \
1434  msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \
1435 } \
1436 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
1437  v_##_Tpvec& c, v_##_Tpvec& d) \
1438 { \
1439  msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \
1440 } \
1441 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1442  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1443 { \
1444  msa_st2q_##suffix(ptr, a.val, b.val); \
1445 } \
1446 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1447  const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1448 { \
1449  msa_st3q_##suffix(ptr, a.val, b.val, c.val); \
1450 } \
1451 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1452  const v_##_Tpvec& c, const v_##_Tpvec& d, \
1453  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
1454 { \
1455  msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \
1456 }
1457 
1458 OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8)
1459 OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8)
1460 OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16)
1461 OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16)
1462 OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32)
1463 OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32)
1464 OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32)
1465 OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64)
1466 OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64)
1467 OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64)
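// De-interleaving loads and interleaving stores for 2-, 3- and 4-channel data;
// the StoreMode argument is accepted for API compatibility but ignored here.
// Illustrative sketch (hypothetical buffer):
//   uchar rgb[48];                        // 16 packed R,G,B triplets
//   v_uint8x16 r, g, b;
//   v_load_deinterleave(rgb, r, g, b);    // r = R bytes, g = G bytes, b = B bytes
//   v_store_interleave(rgb, r, g, b);     // pack the planes back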
1468 
1469 /* v_cvt_f32, v_cvt_f64, v_cvt_f64_high */
1470 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1471 {
1472  return v_float32x4(msa_cvtfintq_f32_s32(a.val));
1473 }
1474 
1475 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1476 {
1477  return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0f)));
1478 }
1479 
1480 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1481 {
1482  return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val));
1483 }
1484 
1485 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1486 {
1487  return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
1488 }
1489 
1490 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1491 {
1492  return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
1493 }
1494 
1495 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1496 {
1497  return v_float64x2(msa_cvtflq_f64_f32(a.val));
1498 }
1499 
1500 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1501 {
1502  return v_float64x2(msa_cvtfhq_f64_f32(a.val));
1503 }
1504 
1505 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
1506 {
1507  return v_float64x2(msa_cvtfintq_f64_s64(a.val));
1508 }
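// Conversion helpers: v_cvt_f32(f64[, f64]) narrows doubles into the lower
// float lanes (unfilled lanes are zero); v_cvt_f64 converts the lower half of
// its input and v_cvt_f64_high the upper half. Illustrative sketch:
//   v_float32x4 f(1.f, 2.f, 3.f, 4.f);
//   v_cvt_f64(f);        // {1.0, 2.0}
//   v_cvt_f64_high(f);   // {3.0, 4.0}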
1509 
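/* v_lut, v_lut_pairs, v_lut_quads: table lookups driven by an index array */
// v_lut gathers one element per lane (lane i = tab[idx[i]]); v_lut_pairs and
// v_lut_quads gather two or four consecutive elements per index. Illustrative
// sketch (tab and idx are hypothetical):
//   v_int8x16 g  = v_lut(tab, idx);        // {tab[idx[0]], tab[idx[1]], ...}
//   v_int8x16 gp = v_lut_pairs(tab, idx);  // {tab[idx[0]], tab[idx[0]+1], tab[idx[1]], ...}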
1511 inline v_int8x16 v_lut(const schar* tab, const int* idx)
1512 {
1513  schar CV_DECL_ALIGNED(32) elems[16] =
1514  {
1515  tab[idx[ 0]],
1516  tab[idx[ 1]],
1517  tab[idx[ 2]],
1518  tab[idx[ 3]],
1519  tab[idx[ 4]],
1520  tab[idx[ 5]],
1521  tab[idx[ 6]],
1522  tab[idx[ 7]],
1523  tab[idx[ 8]],
1524  tab[idx[ 9]],
1525  tab[idx[10]],
1526  tab[idx[11]],
1527  tab[idx[12]],
1528  tab[idx[13]],
1529  tab[idx[14]],
1530  tab[idx[15]]
1531  };
1532  return v_int8x16(msa_ld1q_s8(elems));
1533 }
1534 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1535 {
1536  schar CV_DECL_ALIGNED(32) elems[16] =
1537  {
1538  tab[idx[0]],
1539  tab[idx[0] + 1],
1540  tab[idx[1]],
1541  tab[idx[1] + 1],
1542  tab[idx[2]],
1543  tab[idx[2] + 1],
1544  tab[idx[3]],
1545  tab[idx[3] + 1],
1546  tab[idx[4]],
1547  tab[idx[4] + 1],
1548  tab[idx[5]],
1549  tab[idx[5] + 1],
1550  tab[idx[6]],
1551  tab[idx[6] + 1],
1552  tab[idx[7]],
1553  tab[idx[7] + 1]
1554  };
1555  return v_int8x16(msa_ld1q_s8(elems));
1556 }
1557 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1558 {
1559  schar CV_DECL_ALIGNED(32) elems[16] =
1560  {
1561  tab[idx[0]],
1562  tab[idx[0] + 1],
1563  tab[idx[0] + 2],
1564  tab[idx[0] + 3],
1565  tab[idx[1]],
1566  tab[idx[1] + 1],
1567  tab[idx[1] + 2],
1568  tab[idx[1] + 3],
1569  tab[idx[2]],
1570  tab[idx[2] + 1],
1571  tab[idx[2] + 2],
1572  tab[idx[2] + 3],
1573  tab[idx[3]],
1574  tab[idx[3] + 1],
1575  tab[idx[3] + 2],
1576  tab[idx[3] + 3]
1577  };
1578  return v_int8x16(msa_ld1q_s8(elems));
1579 }
1580 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1581 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1582 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
1583 
1584 
1585 inline v_int16x8 v_lut(const short* tab, const int* idx)
1586 {
1587  short CV_DECL_ALIGNED(32) elems[8] =
1588  {
1589  tab[idx[0]],
1590  tab[idx[1]],
1591  tab[idx[2]],
1592  tab[idx[3]],
1593  tab[idx[4]],
1594  tab[idx[5]],
1595  tab[idx[6]],
1596  tab[idx[7]]
1597  };
1598  return v_int16x8(msa_ld1q_s16(elems));
1599 }
1600 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1601 {
1602  short CV_DECL_ALIGNED(32) elems[8] =
1603  {
1604  tab[idx[0]],
1605  tab[idx[0] + 1],
1606  tab[idx[1]],
1607  tab[idx[1] + 1],
1608  tab[idx[2]],
1609  tab[idx[2] + 1],
1610  tab[idx[3]],
1611  tab[idx[3] + 1]
1612  };
1613  return v_int16x8(msa_ld1q_s16(elems));
1614 }
1615 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1616 {
1617  return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1])));
1618 }
1619 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
1620 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
1621 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
1622 
1623 inline v_int32x4 v_lut(const int* tab, const int* idx)
1624 {
1625  int CV_DECL_ALIGNED(32) elems[4] =
1626  {
1627  tab[idx[0]],
1628  tab[idx[1]],
1629  tab[idx[2]],
1630  tab[idx[3]]
1631  };
1632  return v_int32x4(msa_ld1q_s32(elems));
1633 }
1634 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1635 {
1636  return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1])));
1637 }
1638 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1639 {
1640  return v_int32x4(msa_ld1q_s32(tab + idx[0]));
1641 }
1642 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
1643 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1644 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1645 
1646 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1647 {
1648  return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]])));
1649 }
1650 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1651 {
1652  return v_int64x2(msa_ld1q_s64(tab + idx[0]));
1653 }
1654 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1655 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1656 
1657 inline v_float32x4 v_lut(const float* tab, const int* idx)
1658 {
1659  float CV_DECL_ALIGNED(32) elems[4] =
1660  {
1661  tab[idx[0]],
1662  tab[idx[1]],
1663  tab[idx[2]],
1664  tab[idx[3]]
1665  };
1666  return v_float32x4(msa_ld1q_f32(elems));
1667 }
1668 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1669 {
1670  uint64 CV_DECL_ALIGNED(32) elems[2] =
1671  {
1672  *(uint64*)(tab + idx[0]),
1673  *(uint64*)(tab + idx[1])
1674  };
1675  return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems)));
1676 }
1677 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1678 {
1679  return v_float32x4(msa_ld1q_f32(tab + idx[0]));
1680 }
1681 
1682 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1683 {
1684  int CV_DECL_ALIGNED(32) idx[4];
1685  v_store_aligned(idx, idxvec);
1686 
1687  return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1688 }
1689 
1690 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1691 {
1692  unsigned CV_DECL_ALIGNED(32) elems[4] =
1693  {
1694  tab[msa_getq_lane_s32(idxvec.val, 0)],
1695  tab[msa_getq_lane_s32(idxvec.val, 1)],
1696  tab[msa_getq_lane_s32(idxvec.val, 2)],
1697  tab[msa_getq_lane_s32(idxvec.val, 3)]
1698  };
1699  return v_uint32x4(msa_ld1q_u32(elems));
1700 }
1701 
1702 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1703 {
1704  int CV_DECL_ALIGNED(32) idx[4];
1705  v_store_aligned(idx, idxvec);
1706 
1707  return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1708 }
1709 
1710 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1711 {
1712  int CV_DECL_ALIGNED(32) idx[4];
1713  v_store_aligned(idx, idxvec);
1714 
1715  v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2]));
1716  v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3]));
1717  x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
1718  y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
1719 }
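// v_lut_deinterleave gathers (x, y) pairs stored consecutively in tab:
//   x[k] = tab[idx[k]], y[k] = tab[idx[k] + 1]  for the four indices in idxvec.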
1720 
1721 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
1722 {
1723  v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val));
1724  return c;
1725 }
1726 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
1727 { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1728 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
1729 {
1730  v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val));
1731  return c;
1732 }
1733 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1734 
1735 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
1736 {
1737  v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val));
1738  return c;
1739 }
1740 
1741 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1742 
1743 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
1744 {
1745  v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val));
1746  return c;
1747 }
1748 
1749 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1750 
1751 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
1752 {
1753  v_int32x4 c;
1754  c.val[0] = vec.val[0];
1755  c.val[1] = vec.val[2];
1756  c.val[2] = vec.val[1];
1757  c.val[3] = vec.val[3];
1758  return c;
1759 }
1760 
1761 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1762 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
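// v_interleave_pairs reorders each group of four lanes as {0, 2, 1, 3};
// v_interleave_quads reorders each group of eight lanes as {0, 4, 1, 5, 2, 6, 3, 7}.
// For 8- and 16-bit types this is done with __builtin_msa_vshf_*, whose control
// constants above encode the source lane indices element by element (little-endian).
// Illustrative sketch:
//   v_int32x4 v(10, 11, 12, 13);
//   v_interleave_pairs(v);   // {10, 12, 11, 13}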
1763 
1764 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
1765 {
1766  v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val));
1767  return c;
1768 }
1769 
1770 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1771 
1772 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
1773 {
1774  v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val));
1775  return c;
1776 }
1777 
1778 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1779 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
1780 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
1781 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
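// v_pack_triplets keeps the first three lanes of every group of four and drops
// the fourth; the trailing quarter of the result carries no meaningful data.
// Illustrative sketch:
//   v_int16x8 v(0, 1, 2, 3, 4, 5, 6, 7);
//   v_pack_triplets(v);   // {0, 1, 2, 4, 5, 6, <unused>, <unused>}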
1782 
1783 inline v_float64x2 v_lut(const double* tab, const int* idx)
1784 {
1785  double CV_DECL_ALIGNED(32) elems[2] =
1786  {
1787  tab[idx[0]],
1788  tab[idx[1]]
1789  };
1790  return v_float64x2(msa_ld1q_f64(elems));
1791 }
1792 
1793 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1794 {
1795  return v_float64x2(msa_ld1q_f64(tab + idx[0]));
1796 }
1797 
1798 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1799 {
1800  int CV_DECL_ALIGNED(32) idx[4];
1801  v_store_aligned(idx, idxvec);
1802 
1803  return v_float64x2(tab[idx[0]], tab[idx[1]]);
1804 }
1805 
1806 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1807 {
1808  int CV_DECL_ALIGNED(32) idx[4];
1809  v_store_aligned(idx, idxvec);
1810 
1811  v2f64 xy0 = msa_ld1q_f64(tab + idx[0]);
1812  v2f64 xy1 = msa_ld1q_f64(tab + idx[1]);
1813  x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
1814  y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
1815 }
1816 
1817 template<int i, typename _Tp>
1818 inline typename _Tp::lane_type v_extract_n(const _Tp& a)
1819 {
1820  return v_rotate_right<i>(a).get0();
1821 }
1822 
1823 template<int i>
1824 inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
1825 {
1826  return v_setall_u32(v_extract_n<i>(a));
1827 }
1828 template<int i>
1829 inline v_int32x4 v_broadcast_element(const v_int32x4& a)
1830 {
1831  return v_setall_s32(v_extract_n<i>(a));
1832 }
1833 template<int i>
1834 inline v_float32x4 v_broadcast_element(const v_float32x4& a)
1835 {
1836  return v_setall_f32(v_extract_n<i>(a));
1837 }
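// v_extract_n<i> returns lane i of a vector (implemented as rotate + get0), and
// v_broadcast_element<i> splats that lane across all lanes. Illustrative sketch:
//   v_int32x4 v(10, 20, 30, 40);
//   int x = v_extract_n<2>(v);                 // 30
//   v_int32x4 b = v_broadcast_element<2>(v);   // {30, 30, 30, 30}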
1838 
1840 #if CV_FP16
1841 inline v_float32x4 v_load_expand(const hfloat* ptr)
1842 {
1843 #ifndef msa_ld1_f16
1844  v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
1845 #else
1846  v4f16 v = msa_ld1_f16((const __fp16*)ptr);
1847 #endif
1848  return v_float32x4(msa_cvt_f32_f16(v));
1849 }
1850 
1851 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
1852 {
1853  v4f16 hv = msa_cvt_f16_f32(v.val);
1854 
1855 #ifndef msa_st1_f16
1856  msa_st1_s16((short*)ptr, (int16x4_t)hv);
1857 #else
1858  msa_st1_f16((__fp16*)ptr, hv);
1859 #endif
1860 }
1861 #else
1862 inline v_float32x4 v_load_expand(const hfloat* ptr)
1863 {
1864  float buf[4];
1865  for( int i = 0; i < 4; i++ )
1866  buf[i] = (float)ptr[i];
1867  return v_load(buf);
1868 }
1869 
1870 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
1871 {
1872  float buf[4];
1873  v_store(buf, v);
1874  for( int i = 0; i < 4; i++ )
1875  ptr[i] = (hfloat)buf[i];
1876 }
1877 #endif
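// Half-precision support: with CV_FP16 the native f16 load/convert path is
// used; otherwise a scalar fallback widens/narrows element by element through
// float. Illustrative sketch (values are hypothetical):
//   hfloat h[4];
//   v_float32x4 f = v_load_expand(h);   // 4 x f16 -> 4 x f32
//   v_pack_store(h, f);                 // 4 x f32 -> 4 x f16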
1878 
1879 inline void v_cleanup() {}
1880 
1881 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
1882 
1884 
1885 }
1886 
1887 #endif