EstervQrCode 1.1.1
Library for QR code manipulation
intrin_vsx.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4 
5 #ifndef OPENCV_HAL_VSX_HPP
6 #define OPENCV_HAL_VSX_HPP
7 
8 #include <algorithm>
9 #include "opencv2/core/utility.hpp"
10 
11 #define CV_SIMD128 1
12 #define CV_SIMD128_64F 1
13 
14 namespace cv
15 {
16 
18 
19 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
20 
22 
23 struct v_uint8x16
24 {
25  typedef uchar lane_type;
26  enum { nlanes = 16 };
27  vec_uchar16 val;
28 
29  explicit v_uint8x16(const vec_uchar16& v) : val(v)
30  {}
31  v_uint8x16()
32  {}
33  v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v))
34  {}
35  v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
36  uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
37  : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
38  {}
39 
40  static inline v_uint8x16 zero() { return v_uint8x16(vec_uchar16_z); }
41 
42  uchar get0() const
43  { return vec_extract(val, 0); }
44 };
45 
46 struct v_int8x16
47 {
48  typedef schar lane_type;
49  enum { nlanes = 16 };
50  vec_char16 val;
51 
52  explicit v_int8x16(const vec_char16& v) : val(v)
53  {}
54  v_int8x16()
55  {}
56  v_int8x16(vec_bchar16 v) : val(vec_char16_c(v))
57  {}
58  v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
59  schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
60  : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
61  {}
62 
63  static inline v_int8x16 zero() { return v_int8x16(vec_char16_z); }
64 
65  schar get0() const
66  { return vec_extract(val, 0); }
67 };
68 
69 struct v_uint16x8
70 {
71  typedef ushort lane_type;
72  enum { nlanes = 8 };
73  vec_ushort8 val;
74 
75  explicit v_uint16x8(const vec_ushort8& v) : val(v)
76  {}
77  v_uint16x8()
78  {}
79  v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v))
80  {}
81  v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
82  : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7))
83  {}
84 
85  static inline v_uint16x8 zero() { return v_uint16x8(vec_ushort8_z); }
86 
87  ushort get0() const
88  { return vec_extract(val, 0); }
89 };
90 
91 struct v_int16x8
92 {
93  typedef short lane_type;
94  enum { nlanes = 8 };
95  vec_short8 val;
96 
97  explicit v_int16x8(const vec_short8& v) : val(v)
98  {}
99  v_int16x8()
100  {}
101  v_int16x8(vec_bshort8 v) : val(vec_short8_c(v))
102  {}
103  v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
104  : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7))
105  {}
106 
107  static inline v_int16x8 zero() { return v_int16x8(vec_short8_z); }
108 
109  short get0() const
110  { return vec_extract(val, 0); }
111 };
112 
113 struct v_uint32x4
114 {
115  typedef unsigned lane_type;
116  enum { nlanes = 4 };
117  vec_uint4 val;
118 
119  explicit v_uint32x4(const vec_uint4& v) : val(v)
120  {}
121  v_uint32x4()
122  {}
123  v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v))
124  {}
125  v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3))
126  {}
127 
128  static inline v_uint32x4 zero() { return v_uint32x4(vec_uint4_z); }
129 
130  uint get0() const
131  { return vec_extract(val, 0); }
132 };
133 
134 struct v_int32x4
135 {
136  typedef int lane_type;
137  enum { nlanes = 4 };
138  vec_int4 val;
139 
140  explicit v_int32x4(const vec_int4& v) : val(v)
141  {}
142  v_int32x4()
143  {}
144  v_int32x4(vec_bint4 v) : val(vec_int4_c(v))
145  {}
146  v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3))
147  {}
148 
149  static inline v_int32x4 zero() { return v_int32x4(vec_int4_z); }
150 
151  int get0() const
152  { return vec_extract(val, 0); }
153 };
154 
155 struct v_float32x4
156 {
157  typedef float lane_type;
158  enum { nlanes = 4 };
159  vec_float4 val;
160 
161  explicit v_float32x4(const vec_float4& v) : val(v)
162  {}
163  v_float32x4()
164  {}
165  v_float32x4(vec_bint4 v) : val(vec_float4_c(v))
166  {}
167  v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3))
168  {}
169 
170  static inline v_float32x4 zero() { return v_float32x4(vec_float4_z); }
171 
172  float get0() const
173  { return vec_extract(val, 0); }
174 };
175 
176 struct v_uint64x2
177 {
178  typedef uint64 lane_type;
179  enum { nlanes = 2 };
180  vec_udword2 val;
181 
182  explicit v_uint64x2(const vec_udword2& v) : val(v)
183  {}
184  v_uint64x2()
185  {}
186  v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v))
187  {}
188  v_uint64x2(uint64 v0, uint64 v1) : val(vec_udword2_set(v0, v1))
189  {}
190 
191  static inline v_uint64x2 zero() { return v_uint64x2(vec_udword2_z); }
192 
193  uint64 get0() const
194  { return vec_extract(val, 0); }
195 };
196 
197 struct v_int64x2
198 {
199  typedef int64 lane_type;
200  enum { nlanes = 2 };
201  vec_dword2 val;
202 
203  explicit v_int64x2(const vec_dword2& v) : val(v)
204  {}
205  v_int64x2()
206  {}
207  v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v))
208  {}
209  v_int64x2(int64 v0, int64 v1) : val(vec_dword2_set(v0, v1))
210  {}
211 
212  static inline v_int64x2 zero() { return v_int64x2(vec_dword2_z); }
213 
214  int64 get0() const
215  { return vec_extract(val, 0); }
216 };
217 
218 struct v_float64x2
219 {
220  typedef double lane_type;
221  enum { nlanes = 2 };
222  vec_double2 val;
223 
224  explicit v_float64x2(const vec_double2& v) : val(v)
225  {}
226  v_float64x2()
227  {}
228  v_float64x2(vec_bdword2 v) : val(vec_double2_c(v))
229  {}
230  v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1))
231  {}
232 
233  static inline v_float64x2 zero() { return v_float64x2(vec_double2_z); }
234 
235  double get0() const
236  { return vec_extract(val, 0); }
237 };
238 
239 #define OPENCV_HAL_IMPL_VSX_EXTRACT_N(_Tpvec, _Tp) \
240 template<int i> inline _Tp v_extract_n(VSX_UNUSED(_Tpvec v)) { return vec_extract(v.val, i); }
241 
242 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint8x16, uchar)
243 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int8x16, schar)
244 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint16x8, ushort)
245 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int16x8, short)
246 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint32x4, uint)
247 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int32x4, int)
248 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint64x2, uint64)
249 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int64x2, int64)
250 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float32x4, float)
251 OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
252 
253 
255 /*
256  * clang-5 aborts while parsing "vec_xxx_c" only when it appears
257  * inside a function template that is defined by a preprocessor macro.
258  *
259  * If vec_xxx_c is defined as a C++ cast, clang-5 accepts it.
260 */
261 #define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
262 inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); } \
263 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \
264 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
265 { return _Tpvec((cast)a.val); }
266 
267 OPENCV_HAL_IMPL_VSX_INITVEC(v_uint8x16, uchar, u8, vec_uchar16)
268 OPENCV_HAL_IMPL_VSX_INITVEC(v_int8x16, schar, s8, vec_char16)
269 OPENCV_HAL_IMPL_VSX_INITVEC(v_uint16x8, ushort, u16, vec_ushort8)
270 OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
271 OPENCV_HAL_IMPL_VSX_INITVEC(v_uint32x4, uint, u32, vec_uint4)
272 OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
273 OPENCV_HAL_IMPL_VSX_INITVEC(v_uint64x2, uint64, u64, vec_udword2)
274 OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
275 OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
276 OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
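// Editor's sketch: the macro above generates v_setzero_*, v_setall_* and
// v_reinterpret_as_* for every lane type; a minimal usage example for 8-bit
// lanes follows (the helper name is hypothetical, not part of the original header).
inline v_int8x16 example_init_u8()
{
    v_uint8x16 zeros = v_setzero_u8();             // all 16 lanes = 0
    v_uint8x16 fill  = v_setall_u8((uchar)0x7f);   // all 16 lanes = 0x7f
    (void)zeros;
    return v_reinterpret_as_s8(fill);              // same 128 bits, signed view
}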
277 
278 #define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a) \
279 inline _Tpvec v_load(const _Tp* ptr) \
280 { return _Tpvec(ld(0, ptr)); } \
281 inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr)) \
282 { return _Tpvec(ld_a(0, ptr)); } \
283 inline _Tpvec v_load_low(const _Tp* ptr) \
284 { return _Tpvec(vec_ld_l8(ptr)); } \
285 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
286 { return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
287 inline void v_store(_Tp* ptr, const _Tpvec& a) \
288 { st(a.val, 0, ptr); } \
289 inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
290 { st_a(a.val, 0, ptr); } \
291 inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
292 { st_a(a.val, 0, ptr); } \
293 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
294 { if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
295 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
296 { vec_st_l8(a.val, ptr); } \
297 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
298 { vec_st_h8(a.val, ptr); }
299 
300 // work around a gcc bug for aligned ld/st
301 // if the runtime check for vec_ld/st fails, we fall back to unaligned ld/st
302 // https://github.com/opencv/opencv/issues/13211
303 #ifdef CV_COMPILER_VSX_BROKEN_ALIGNED
304  #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
305  OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vsx_ld, vsx_st, vsx_st)
306 #else
307  #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
308  OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)
309 #endif
310 
311 OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16, uchar)
312 OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16, schar)
313 OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8, ushort)
314 OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8, short)
315 OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4, uint)
316 OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4, int)
317 OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float)
318 
319 OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld, vsx_ld, vsx_st, vsx_st)
320 OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2, uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
321 OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2, int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
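// Editor's sketch: typical use of the load/store wrappers generated above;
// v_load/v_store are unaligned, while the *_aligned variants expect 16-byte
// aligned pointers (or fall back to vsx_ld/vsx_st when
// CV_COMPILER_VSX_BROKEN_ALIGNED is set). The helper name is hypothetical.
inline void example_copy4_f32(const float* src, float* dst)
{
    v_float32x4 v = v_load(src);   // unaligned 16-byte load
    v_store(dst, v);               // unaligned 16-byte store
}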
322 
323 
325 /* deinterleave & interleave */
326 #define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec) \
327 inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
328 { vec_ld_deinterleave(ptr, a.val, b.val);} \
329 inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
330  _Tpvec& b, _Tpvec& c) \
331 { vec_ld_deinterleave(ptr, a.val, b.val, c.val); } \
332 inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
333  _Tpvec& c, _Tpvec& d) \
334 { vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
335 inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
336  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
337 { vec_st_interleave(a.val, b.val, ptr); } \
338 inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
339  const _Tpvec& b, const _Tpvec& c, \
340  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
341 { vec_st_interleave(a.val, b.val, c.val, ptr); } \
342 inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
343  const _Tpvec& c, const _Tpvec& d, \
344  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
345 { vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
346 
347 OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
348 OPENCV_HAL_IMPL_VSX_INTERLEAVE(schar, v_int8x16)
349 OPENCV_HAL_IMPL_VSX_INTERLEAVE(ushort, v_uint16x8)
350 OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
351 OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
352 OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
353 OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
354 OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
355 OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
356 OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)
357 
358 /* Expand */
359 #define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \
360 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
361 { \
362  b0.val = fh(a.val); \
363  b1.val = fl(a.val); \
364 } \
365 inline _Tpwvec v_expand_low(const _Tpvec& a) \
366 { return _Tpwvec(fh(a.val)); } \
367 inline _Tpwvec v_expand_high(const _Tpvec& a) \
368 { return _Tpwvec(fl(a.val)); } \
369 inline _Tpwvec v_load_expand(const _Tp* ptr) \
370 { return _Tpwvec(fh(vec_ld_l8(ptr))); }
371 
372 OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
373 OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
374 OPENCV_HAL_IMPL_VSX_EXPAND(v_uint16x8, v_uint32x4, ushort, vec_unpacklu, vec_unpackhu)
375 OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh)
376 OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
377 OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
378 
379 /* Load a 4-byte value zero-extended into the second dword; the first dword is don't-care. */
380 #if !defined(CV_COMPILER_VSX_BROKEN_ASM)
381  #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
382 #else
383  /* This is compiler-agnostic, but will introduce an unneeded splat on the critical path. */
384  #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
385 #endif
386 
387 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
388 {
389  // Zero-extend with a single permute instead of unpacking; usually faster in small kernels.
390  // Note: the loaded value is zero-extended, so the upper bytes of the register are already zeroed.
391  vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
392  vec_uchar16 out;
393 
394  _LXSIWZX(out, ptr, vec_uchar16);
395  out = vec_perm(out, out, pmu);
396  return v_uint32x4((vec_uint4)out);
397 }
398 
399 inline v_int32x4 v_load_expand_q(const schar* ptr)
400 {
401  vec_char16 out;
402  vec_short8 outs;
403  vec_int4 outw;
404 
405  _LXSIWZX(out, ptr, vec_char16);
406  outs = vec_unpackl(out);
407  outw = vec_unpackh(outs);
408  return v_int32x4(outw);
409 }
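// Editor's sketch: the expand loads above widen a short memory read into full
// vector lanes; v_load_expand reads 8 bytes, v_load_expand_q only 4. The helper
// name is hypothetical.
inline void example_expand_u8(const uchar* ptr)
{
    v_uint16x8 half = v_load_expand(ptr);    // 8 uchar -> 8 ushort lanes
    v_uint32x4 quad = v_load_expand_q(ptr);  // 4 uchar -> 4 uint lanes, zero-extended
    (void)half; (void)quad;
}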
410 
411 /* pack */
412 #define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
413 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
414 { \
415  return _Tpvec(pkfnc(a.val, b.val)); \
416 } \
417 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
418 { \
419  vec_st_l8(pkfnc(a.val, a.val), ptr); \
420 } \
421 template<int n> \
422 inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
423 { \
424  const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
425  const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
426  return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn))); \
427 } \
428 template<int n> \
429 inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
430 { \
431  const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
432  const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
433  vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr); \
434 }
435 
436 OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short,
437  vec_sr, vec_packs, vec_adds, pack)
438 OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short,
439  vec_sra, vec_packs, vec_adds, pack)
440 
441 OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int,
442  vec_sr, vec_packs, vec_add, pack)
443 OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int,
444  vec_sra, vec_packs, vec_add, pack)
445 
446 OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long,
447  vec_sr, vec_pack, vec_add, pack)
448 OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long,
449  vec_sra, vec_pack, vec_add, pack)
450 
451 OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short,
452  vec_sra, vec_packsu, vec_adds, pack_u)
453 OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
454  vec_sra, vec_packsu, vec_add, pack_u)
455 // The following variant is not implemented on other platforms:
456 //OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
457 // vec_sra, vec_packsu, vec_add, pack_u)
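// Editor's sketch: v_rshr_pack<n> adds the rounding bias 1 << (n-1) before the
// right shift and then saturates into the narrower lane type, so for n = 2 a
// 16-bit lane holding 10 packs to (10 + 2) >> 2 = 3. The helper name is
// hypothetical.
inline v_uint8x16 example_rshr_pack2(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_rshr_pack<2>(a, b);   // rounded >> 2, saturated to uchar, 16 lanes
}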
458 
459 // pack boolean
460 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
461 {
462  vec_uchar16 ab = vec_pack(a.val, b.val);
463  return v_uint8x16(ab);
464 }
465 
466 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
467  const v_uint32x4& c, const v_uint32x4& d)
468 {
469  vec_ushort8 ab = vec_pack(a.val, b.val);
470  vec_ushort8 cd = vec_pack(c.val, d.val);
471  return v_uint8x16(vec_pack(ab, cd));
472 }
473 
474 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
475  const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
476  const v_uint64x2& g, const v_uint64x2& h)
477 {
478  vec_uint4 ab = vec_pack(a.val, b.val);
479  vec_uint4 cd = vec_pack(c.val, d.val);
480  vec_uint4 ef = vec_pack(e.val, f.val);
481  vec_uint4 gh = vec_pack(g.val, h.val);
482 
483  vec_ushort8 abcd = vec_pack(ab, cd);
484  vec_ushort8 efgh = vec_pack(ef, gh);
485  return v_uint8x16(vec_pack(abcd, efgh));
486 }
487 
488 /* Recombine */
489 template <typename _Tpvec>
490 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
491 {
492  b0.val = vec_mergeh(a0.val, a1.val);
493  b1.val = vec_mergel(a0.val, a1.val);
494 }
495 
496 template <typename _Tpvec>
497 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
498 { return _Tpvec(vec_mergesql(a.val, b.val)); }
499 
500 template <typename _Tpvec>
501 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
502 { return _Tpvec(vec_mergesqh(a.val, b.val)); }
503 
504 template <typename _Tpvec>
505 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
506 {
507  c.val = vec_mergesqh(a.val, b.val);
508  d.val = vec_mergesql(a.val, b.val);
509 }
510 
512 
513 /* Element-wise binary and unary operations */
515 #define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
516 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
517 { return _Tpvec(intrin(a.val, b.val)); } \
518 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
519 { a.val = intrin(a.val, b.val); return a; }
520 
521 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
522 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
523 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
524 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
525 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
526 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
527 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
528 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
529 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
530 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
531 OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
532 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
533 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
534 OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
535 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
536 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
537 OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
538 OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
539 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
540 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
541 OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
542 OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
543 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
544 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
545 OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
546 OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
547 
548 // saturating multiply
549 #define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
550  inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
551  { \
552  _Tpwvec c, d; \
553  v_mul_expand(a, b, c, d); \
554  return v_pack(c, d); \
555  } \
556  inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
557  { a = a * b; return a; }
558 
559 OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
560 OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
561 OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8, v_int32x4)
562 OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)
563 
564 template<typename Tvec, typename Twvec>
565 inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
566 {
567  Twvec p0 = Twvec(vec_mule(a.val, b.val));
568  Twvec p1 = Twvec(vec_mulo(a.val, b.val));
569  v_zip(p0, p1, c, d);
570 }
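// Editor's sketch: the saturating operator* above is built on v_mul_expand,
// which widens through vec_mule/vec_mulo (even/odd lane products) and
// re-interleaves with v_zip; packing the two wide halves back gives the
// saturated product. The helper name is hypothetical.
inline v_int16x8 example_mul_sat(const v_int16x8& a, const v_int16x8& b)
{
    v_int32x4 lo, hi;
    v_mul_expand(a, b, lo, hi);   // full 32-bit products of both halves
    return v_pack(lo, hi);        // saturate back to 16 bits (same as a * b)
}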
571 
572 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
573 {
574  vec_int4 p0 = vec_mule(a.val, b.val);
575  vec_int4 p1 = vec_mulo(a.val, b.val);
576  static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
577  return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm));
578 }
579 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
580 {
581  vec_uint4 p0 = vec_mule(a.val, b.val);
582  vec_uint4 p1 = vec_mulo(a.val, b.val);
583  static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
584  return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm));
585 }
586 
588 #define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) \
589 template<typename _Tpvec> \
590 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
591 { return _Tpvec(intrin(a.val, b.val)); }
592 
593 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
594 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
595 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
596 
597 
598 #define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
599 inline _Tpvec operator << (const _Tpvec& a, int imm) \
600 { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
601 inline _Tpvec operator >> (const _Tpvec& a, int imm) \
602 { return _Tpvec(shr(a.val, splfunc(imm))); } \
603 template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
604 { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
605 template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
606 { return _Tpvec(shr(a.val, splfunc(imm))); }
607 
608 OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
609 OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
610 OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
611 OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
612 // arithmetic (algebraic) right shift
613 OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
614 OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
615 OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
616 OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
617 
618 
619 #define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
620 OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
621 OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
622 OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
623 inline _Tpvec operator ~ (const _Tpvec& a) \
624 { return _Tpvec(vec_not(a.val)); }
625 
626 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
627 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int8x16)
628 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint16x8)
629 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int16x8)
630 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint32x4)
631 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int32x4)
632 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint64x2)
633 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int64x2)
634 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float32x4)
635 OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float64x2)
636 
637 
638 #define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast) \
639 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
640 { return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }
641 
642 OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
643 OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
644 OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
645 OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
646 OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
647 OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
648 OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
649 OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
650 
651 
652 #define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
653 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
654 { return _Tpvec(vec_cmpeq(a.val, b.val)); } \
655 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
656 { return _Tpvec(vec_cmpne(a.val, b.val)); } \
657 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
658 { return _Tpvec(vec_cmplt(a.val, b.val)); } \
659 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
660 { return _Tpvec(vec_cmpgt(a.val, b.val)); } \
661 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
662 { return _Tpvec(vec_cmple(a.val, b.val)); } \
663 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
664 { return _Tpvec(vec_cmpge(a.val, b.val)); }
665 
666 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
667 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
668 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint16x8)
669 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
670 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint32x4)
671 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
672 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float32x4)
673 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
674 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
675 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
676 
677 inline v_float32x4 v_not_nan(const v_float32x4& a)
678 { return v_float32x4(vec_cmpeq(a.val, a.val)); }
679 inline v_float64x2 v_not_nan(const v_float64x2& a)
680 { return v_float64x2(vec_cmpeq(a.val, a.val)); }
681 
683 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
684 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
685 
686 
687 #define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
688 template<int imm> \
689 inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
690 { \
691  const int wd = imm * sizeof(typename _Tpvec::lane_type); \
692  if (wd > 15) \
693  return _Tpvec::zero(); \
694  return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
695 }
696 
697 #define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
698 OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
699 OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
700 
701 OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
702 OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
703 OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
704 OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
705 OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
706 OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
707 OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
708 OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
709 OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
710 OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
711 
712 template<int imm, typename _Tpvec>
713 inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
714 {
715  enum { CV_SHIFT = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };
716  if (CV_SHIFT == 16)
717  return a;
718 #ifdef __IBMCPP__
719  return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT & 15));
720 #else
721  return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT));
722 #endif
723 }
724 
725 template<int imm, typename _Tpvec>
726 inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
727 {
728  enum { CV_SHIFT = imm * (sizeof(typename _Tpvec::lane_type)) };
729  if (CV_SHIFT == 16)
730  return b;
731  return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
732 }
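// Editor's sketch: the two-register rotates act like a sliding window over the
// lane pair (a, b); for 32-bit lanes, v_rotate_right<1>(a, b) should yield
// {a1, a2, a3, b0} in lane order. The helper name is hypothetical.
inline v_uint32x4 example_rotate_right1(const v_uint32x4& a, const v_uint32x4& b)
{
    return v_rotate_right<1>(a, b);   // drop a0, pull b0 into the last lane
}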
733 
734 #define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2) \
735 template<int imm> \
736 inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
737 { \
738  if (imm == 1) \
739  return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
740  return imm ? b : a; \
741 }
742 
743 #define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec) \
744 OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left, b, a) \
745 OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)
746 
747 OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
748 OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
749 OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
750 
751 /* Reverse */
752 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
753 {
754  static const vec_uchar16 perm = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
755  vec_uchar16 vec = (vec_uchar16)a.val;
756  return v_uint8x16(vec_perm(vec, vec, perm));
757 }
758 
759 inline v_int8x16 v_reverse(const v_int8x16 &a)
760 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
761 
762 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
763 {
764  static const vec_uchar16 perm = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
765  vec_uchar16 vec = (vec_uchar16)a.val;
766  return v_reinterpret_as_u16(v_uint8x16(vec_perm(vec, vec, perm)));
767 }
768 
769 inline v_int16x8 v_reverse(const v_int16x8 &a)
770 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
771 
772 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
773 {
774  static const vec_uchar16 perm = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
775  vec_uchar16 vec = (vec_uchar16)a.val;
776  return v_reinterpret_as_u32(v_uint8x16(vec_perm(vec, vec, perm)));
777 }
778 
779 inline v_int32x4 v_reverse(const v_int32x4 &a)
780 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
781 
782 inline v_float32x4 v_reverse(const v_float32x4 &a)
783 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
784 
785 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
786 {
787  static const vec_uchar16 perm = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
788  vec_uchar16 vec = (vec_uchar16)a.val;
789  return v_reinterpret_as_u64(v_uint8x16(vec_perm(vec, vec, perm)));
790 }
791 
792 inline v_int64x2 v_reverse(const v_int64x2 &a)
793 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
794 
795 inline v_float64x2 v_reverse(const v_float64x2 &a)
796 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
797 
798 /* Extract */
799 template<int s, typename _Tpvec>
800 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
801 { return v_rotate_right<s>(a, b); }
802 
804 
806 inline uint v_reduce_sum(const v_uint8x16& a)
807 {
808  const vec_uint4 zero4 = vec_uint4_z;
809  vec_uint4 sum4 = vec_sum4s(a.val, zero4);
810  return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
811 }
812 inline int v_reduce_sum(const v_int8x16& a)
813 {
814  const vec_int4 zero4 = vec_int4_z;
815  vec_int4 sum4 = vec_sum4s(a.val, zero4);
816  return (int)vec_extract(vec_sums(sum4, zero4), 3);
817 }
818 inline int v_reduce_sum(const v_int16x8& a)
819 {
820  const vec_int4 zero = vec_int4_z;
821  return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
822 }
823 inline uint v_reduce_sum(const v_uint16x8& a)
824 {
825  const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
826  return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
827 }
828 
829 #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
830 inline scalartype v_reduce_##suffix(const _Tpvec& a) \
831 { \
832  const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
833  return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0); \
834 }
835 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, sum, vec_add)
836 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, max, vec_max)
837 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, min, vec_min)
838 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
839 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
840 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
841 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
842 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
843 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
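// Editor's sketch: the 4-lane reductions above fold the vector twice with
// vec_sld, so v_reduce_sum({1, 2, 3, 4}) == 10 and v_reduce_max == 4. The
// helper name is hypothetical.
inline unsigned example_reduce_sum_u32(const v_uint32x4& a)
{
    return v_reduce_sum(a);   // a0 + a1 + a2 + a3
}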
844 
845 inline uint64 v_reduce_sum(const v_uint64x2& a)
846 {
847  return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
848 }
849 inline int64 v_reduce_sum(const v_int64x2& a)
850 {
851  return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
852 }
853 inline double v_reduce_sum(const v_float64x2& a)
854 {
855  return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
856 }
857 
858 #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
859 inline scalartype v_reduce_##suffix(const _Tpvec& a) \
860 { \
861  _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
862  rs = func(rs, vec_sld(rs, rs, 4)); \
863  return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0); \
864 }
865 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, max, vec_max)
866 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
867 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
868 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
869 
870 #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
871 inline scalartype v_reduce_##suffix(const _Tpvec& a) \
872 { \
873  _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
874  rs = func(rs, vec_sld(rs, rs, 4)); \
875  rs = func(rs, vec_sld(rs, rs, 2)); \
876  return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
877 }
878 OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
879 OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
880 OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
881 OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
882 
883 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
884  const v_float32x4& c, const v_float32x4& d)
885 {
886  vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
887  ac = vec_add(ac, vec_sld(ac, ac, 8));
888 
889  vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
890  bd = vec_add(bd, vec_sld(bd, bd, 8));
891  return v_float32x4(vec_mergeh(ac, bd));
892 }
893 
894 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
895 {
896  const vec_uint4 zero4 = vec_uint4_z;
897  vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
898  return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
899 }
900 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
901 {
902  const vec_int4 zero4 = vec_int4_z;
903  vec_char16 ad = vec_abss(vec_subs(a.val, b.val));
904  vec_int4 sum4 = vec_sum4s(ad, zero4);
905  return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
906 }
907 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
908 {
909  vec_ushort8 ad = vec_absd(a.val, b.val);
910  VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)) + vec_int4_c(vec_unpacklu(ad)), vec_int4_z);
911  return (unsigned)vec_extract(sum, 3);
912 }
913 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
914 {
915  const vec_int4 zero4 = vec_int4_z;
916  vec_short8 ad = vec_abss(vec_subs(a.val, b.val));
917  vec_int4 sum4 = vec_sum4s(ad, zero4);
918  return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
919 }
920 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
921 {
922  const vec_uint4 ad = vec_absd(a.val, b.val);
923  const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
924  return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
925 }
926 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
927 {
928  vec_int4 ad = vec_abss(vec_sub(a.val, b.val));
929  return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3);
930 }
931 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
932 {
933  const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
934  const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
935  return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
936 }
937 
939 inline v_uint8x16 v_popcount(const v_uint8x16& a)
940 { return v_uint8x16(vec_popcntu(a.val)); }
941 inline v_uint8x16 v_popcount(const v_int8x16& a)
942 { return v_uint8x16(vec_popcntu(a.val)); }
943 inline v_uint16x8 v_popcount(const v_uint16x8& a)
944 { return v_uint16x8(vec_popcntu(a.val)); }
945 inline v_uint16x8 v_popcount(const v_int16x8& a)
946 { return v_uint16x8(vec_popcntu(a.val)); }
947 inline v_uint32x4 v_popcount(const v_uint32x4& a)
948 { return v_uint32x4(vec_popcntu(a.val)); }
949 inline v_uint32x4 v_popcount(const v_int32x4& a)
950 { return v_uint32x4(vec_popcntu(a.val)); }
951 inline v_uint64x2 v_popcount(const v_uint64x2& a)
952 { return v_uint64x2(vec_popcntu(a.val)); }
953 inline v_uint64x2 v_popcount(const v_int64x2& a)
954 { return v_uint64x2(vec_popcntu(a.val)); }
955 
957 inline int v_signmask(const v_uint8x16& a)
958 {
959  static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
960  return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
961 }
962 inline int v_signmask(const v_int8x16& a)
963 { return v_signmask(v_reinterpret_as_u8(a)); }
964 
965 inline int v_signmask(const v_int16x8& a)
966 {
967  static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
968  return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
969 }
970 inline int v_signmask(const v_uint16x8& a)
971 { return v_signmask(v_reinterpret_as_s16(a)); }
972 
973 inline int v_signmask(const v_int32x4& a)
974 {
975  static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
976  return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
977 }
978 inline int v_signmask(const v_uint32x4& a)
979 { return v_signmask(v_reinterpret_as_s32(a)); }
980 inline int v_signmask(const v_float32x4& a)
981 { return v_signmask(v_reinterpret_as_s32(a)); }
982 
983 inline int v_signmask(const v_int64x2& a)
984 {
985  VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
986  return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
987 }
988 inline int v_signmask(const v_uint64x2& a)
989 { return v_signmask(v_reinterpret_as_s64(a)); }
990 inline int v_signmask(const v_float64x2& a)
991 { return v_signmask(v_reinterpret_as_s64(a)); }
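// Editor's sketch: v_signmask packs the most significant bit of every lane into
// an integer (bit i <- lane i), e.g. int32 lanes {-1, 0, -1, 0} give 0b0101 = 5.
// The helper name is hypothetical.
inline bool example_any_negative(const v_int32x4& a)
{
    return v_signmask(a) != 0;   // true iff some lane has its sign bit set
}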
992 
993 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
994 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
995 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
996 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
997 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
998 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
999 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
1000 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
1001 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1002 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
1003 
1004 template<typename _Tpvec>
1005 inline bool v_check_all(const _Tpvec& a)
1006 { return vec_all_lt(a.val, _Tpvec::zero().val); }
1007 inline bool v_check_all(const v_uint8x16& a)
1008 { return v_check_all(v_reinterpret_as_s8(a)); }
1009 inline bool v_check_all(const v_uint16x8& a)
1010 { return v_check_all(v_reinterpret_as_s16(a)); }
1011 inline bool v_check_all(const v_uint32x4& a)
1012 { return v_check_all(v_reinterpret_as_s32(a)); }
1013 inline bool v_check_all(const v_uint64x2& a)
1014 { return v_check_all(v_reinterpret_as_s64(a)); }
1015 inline bool v_check_all(const v_float32x4& a)
1016 { return v_check_all(v_reinterpret_as_s32(a)); }
1017 inline bool v_check_all(const v_float64x2& a)
1018 { return v_check_all(v_reinterpret_as_s64(a)); }
1019 
1020 template<typename _Tpvec>
1021 inline bool v_check_any(const _Tpvec& a)
1022 { return vec_any_lt(a.val, _Tpvec::zero().val); }
1023 inline bool v_check_any(const v_uint8x16& a)
1024 { return v_check_any(v_reinterpret_as_s8(a)); }
1025 inline bool v_check_any(const v_uint16x8& a)
1026 { return v_check_any(v_reinterpret_as_s16(a)); }
1027 inline bool v_check_any(const v_uint32x4& a)
1028 { return v_check_any(v_reinterpret_as_s32(a)); }
1029 inline bool v_check_any(const v_uint64x2& a)
1030 { return v_check_any(v_reinterpret_as_s64(a)); }
1031 inline bool v_check_any(const v_float32x4& a)
1032 { return v_check_any(v_reinterpret_as_s32(a)); }
1033 inline bool v_check_any(const v_float64x2& a)
1034 { return v_check_any(v_reinterpret_as_s64(a)); }
1035 
1037 
1039 inline v_float32x4 v_sqrt(const v_float32x4& x)
1040 { return v_float32x4(vec_sqrt(x.val)); }
1041 inline v_float64x2 v_sqrt(const v_float64x2& x)
1042 { return v_float64x2(vec_sqrt(x.val)); }
1043 
1044 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1045 { return v_float32x4(vec_rsqrt(x.val)); }
1046 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1047 { return v_float64x2(vec_rsqrt(x.val)); }
1048 
1049 #define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec) \
1050 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1051 { return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
1052 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1053 { return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \
1054 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1055 { return _Tpvec(vec_madd(a.val, b.val, c.val)); } \
1056 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1057 { return _Tpvec(vec_madd(a.val, b.val, c.val)); }
1058 
1059 OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
1060 OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
1061 
1062 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1063 { return a * b + c; }
1064 
1065 // TODO: exp, log, sin, cos
1066 
1068 inline v_uint8x16 v_abs(const v_int8x16& x)
1069 { return v_uint8x16(vec_uchar16_c(vec_abs(x.val))); }
1070 
1071 inline v_uint16x8 v_abs(const v_int16x8& x)
1072 { return v_uint16x8(vec_ushort8_c(vec_abs(x.val))); }
1073 
1074 inline v_uint32x4 v_abs(const v_int32x4& x)
1075 { return v_uint32x4(vec_uint4_c(vec_abs(x.val))); }
1076 
1077 inline v_float32x4 v_abs(const v_float32x4& x)
1078 { return v_float32x4(vec_abs(x.val)); }
1079 
1080 inline v_float64x2 v_abs(const v_float64x2& x)
1081 { return v_float64x2(vec_abs(x.val)); }
1082 
1084 // unsigned
1085 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)
1086 
1087 inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1088 { return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1089 inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1090 { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1091 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1092 { return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
1093 
1094 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
1095 { return v_abs(a - b); }
1096 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
1097 { return v_abs(a - b); }
1098 
1100 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1101 { return v_int8x16(vec_abss(vec_subs(a.val, b.val))); }
1102 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1103 { return v_int16x8(vec_abss(vec_subs(a.val, b.val))); }
1104 
1106 
1108 inline v_int32x4 v_round(const v_float32x4& a)
1109 { return v_int32x4(vec_cts(vec_rint(a.val))); }
1110 
1111 inline v_int32x4 v_round(const v_float64x2& a)
1112 { return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); }
1113 
1114 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1115 { return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); }
1116 
1117 inline v_int32x4 v_floor(const v_float32x4& a)
1118 { return v_int32x4(vec_cts(vec_floor(a.val))); }
1119 
1120 inline v_int32x4 v_floor(const v_float64x2& a)
1121 { return v_int32x4(vec_mergesqo(vec_ctso(vec_floor(a.val)), vec_int4_z)); }
1122 
1123 inline v_int32x4 v_ceil(const v_float32x4& a)
1124 { return v_int32x4(vec_cts(vec_ceil(a.val))); }
1125 
1126 inline v_int32x4 v_ceil(const v_float64x2& a)
1127 { return v_int32x4(vec_mergesqo(vec_ctso(vec_ceil(a.val)), vec_int4_z)); }
1128 
1129 inline v_int32x4 v_trunc(const v_float32x4& a)
1130 { return v_int32x4(vec_cts(a.val)); }
1131 
1132 inline v_int32x4 v_trunc(const v_float64x2& a)
1133 { return v_int32x4(vec_mergesqo(vec_ctso(a.val), vec_int4_z)); }
1134 
1136 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1137 { return v_float32x4(vec_ctf(a.val)); }
1138 
1139 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1140 { return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }
1141 
1142 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1143 { return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }
1144 
1145 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1146 { return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
1147 
1148 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1149 { return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }
1150 
1151 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1152 { return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }
1153 
1154 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1155 { return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
1156 
1157 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
1158 { return v_float64x2(vec_ctd(a.val)); }
1159 
1161 
1162 inline v_int8x16 v_lut(const schar* tab, const int* idx)
1163 {
1164  return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
1165  tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
1166 }
1167 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1168 {
1169  return v_reinterpret_as_s8(v_int16x8(*(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]),
1170  *(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7])));
1171 }
1172 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1173 {
1174  return v_reinterpret_as_s8(v_int32x4(*(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3])));
1175 }
1176 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
1177 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
1178 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
1179 
1180 inline v_int16x8 v_lut(const short* tab, const int* idx)
1181 {
1182  return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
1183 }
1184 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1185 {
1186  return v_reinterpret_as_s16(v_int32x4(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
1187 }
1188 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1189 {
1190  return v_reinterpret_as_s16(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
1191 }
1192 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
1193 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
1194 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
1195 
1196 inline v_int32x4 v_lut(const int* tab, const int* idx)
1197 {
1198  return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1199 }
1200 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1201 {
1202  return v_reinterpret_as_s32(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
1203 }
1204 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1205 {
1206  return v_int32x4(vsx_ld(0, tab + idx[0]));
1207 }
1208 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); }
1209 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
1210 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
1211 
1212 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1213 {
1214  return v_int64x2(tab[idx[0]], tab[idx[1]]);
1215 }
1216 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1217 {
1218  return v_int64x2(vsx_ld2(0, tab + idx[0]));
1219 }
1220 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1221 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1222 
1223 inline v_float32x4 v_lut(const float* tab, const int* idx)
1224 {
1225  return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1226 }
1227 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int*)tab, idx)); }
1228 inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_load(tab + *idx); }
1229 
1230 inline v_float64x2 v_lut(const double* tab, const int* idx)
1231 {
1232  return v_float64x2(tab[idx[0]], tab[idx[1]]);
1233 }
1234 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_load(tab + *idx); }
1235 
1236 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1237 {
1238  const int idx[4] = {
1239  vec_extract(idxvec.val, 0),
1240  vec_extract(idxvec.val, 1),
1241  vec_extract(idxvec.val, 2),
1242  vec_extract(idxvec.val, 3)
1243  };
1244  return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1245 }
1246 
1247 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1248 {
1249  const int idx[4] = {
1250  vec_extract(idxvec.val, 0),
1251  vec_extract(idxvec.val, 1),
1252  vec_extract(idxvec.val, 2),
1253  vec_extract(idxvec.val, 3)
1254  };
1255  return v_uint32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1256 }
1257 
1258 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1259 {
1260  const int idx[4] = {
1261  vec_extract(idxvec.val, 0),
1262  vec_extract(idxvec.val, 1),
1263  vec_extract(idxvec.val, 2),
1264  vec_extract(idxvec.val, 3)
1265  };
1266  return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1267 }
1268 
1269 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1270 {
1271  const int idx[2] = {
1272  vec_extract(idxvec.val, 0),
1273  vec_extract(idxvec.val, 1)
1274  };
1275  return v_float64x2(tab[idx[0]], tab[idx[1]]);
1276 }
1277 
1278 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1279 {
1280  vec_float4 xy0 = vec_ld_l8(tab + vec_extract(idxvec.val, 0));
1281  vec_float4 xy1 = vec_ld_l8(tab + vec_extract(idxvec.val, 1));
1282  vec_float4 xy2 = vec_ld_l8(tab + vec_extract(idxvec.val, 2));
1283  vec_float4 xy3 = vec_ld_l8(tab + vec_extract(idxvec.val, 3));
1284  vec_float4 xy02 = vec_mergeh(xy0, xy2); // x0, x2, y0, y2
1285  vec_float4 xy13 = vec_mergeh(xy1, xy3); // x1, x3, y1, y3
1286  x.val = vec_mergeh(xy02, xy13);
1287  y.val = vec_mergel(xy02, xy13);
1288 }
1289 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1290 {
1291  vec_double2 xy0 = vsx_ld(vec_extract(idxvec.val, 0), tab);
1292  vec_double2 xy1 = vsx_ld(vec_extract(idxvec.val, 1), tab);
1293  x.val = vec_mergeh(xy0, xy1);
1294  y.val = vec_mergel(xy0, xy1);
1295 }
1296 
1297 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
1298 {
1299  static const vec_uchar16 perm = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15};
1300  return v_int8x16(vec_perm(vec.val, vec.val, perm));
1301 }
1302 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
1303 { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1304 
1305 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
1306 {
1307  static const vec_uchar16 perm = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
1308  return v_int8x16(vec_perm(vec.val, vec.val, perm));
1309 }
1310 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
1311 { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1312 
1313 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
1314 {
1315  static const vec_uchar16 perm = {0,1, 4,5, 2,3, 6,7, 8,9, 12,13, 10,11, 14,15};
1316  return v_int16x8(vec_perm(vec.val, vec.val, perm));
1317 }
1318 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
1319 { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1320 
1321 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
1322 {
1323  static const vec_uchar16 perm = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
1324  return v_int16x8(vec_perm(vec.val, vec.val, perm));
1325 }
1326 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
1327 { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1328 
1329 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
1330 {
1331  static const vec_uchar16 perm = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15};
1332  return v_int32x4(vec_perm(vec.val, vec.val, perm));
1333 }
1334 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
1335 { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1336 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
1337 { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1338 
1339 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
1340 {
1341  static const vec_uchar16 perm = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 15, 15, 15};
1342  return v_int8x16(vec_perm(vec.val, vec.val, perm));
1343 }
1344 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
1345 { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1346 
1347 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
1348 {
1349  static const vec_uchar16 perm = {0,1, 2,3, 4,5, 8,9, 10,11, 12,13, 14,15, 14,15};
1350  return v_int16x8(vec_perm(vec.val, vec.val, perm));
1351 }
1352 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
1353 { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1354 
1355 inline v_int32x4 v_pack_triplets(const v_int32x4& vec)
1356 { return vec; }
1357 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec)
1358 { return vec; }
1359 inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
1360 { return vec; }
1361 
1363 
1364 inline v_float32x4 v_load_expand(const hfloat* ptr)
1365 {
1366  vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
1367 #if CV_VSX3 && defined(vec_extract_fp_from_shorth)
1368  return v_float32x4(vec_extract_fp_from_shorth(vf16));
1369 #elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
1370  vec_float4 vf32;
1371  __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wa" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
1372  return v_float32x4(vf32);
1373 #else
1374  const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
1375  const vec_int4 signmask = vec_int4_sp(0x80000000);
1376  const vec_int4 maxexp = vec_int4_sp(0x7c000000);
1377  const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));
1378 
1379  vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16)));
1380  vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
1381  vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
1382  vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf));
1383 
1384  t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e)));
1385  vec_bint4 zmask = vec_cmpeq(e, z);
1386  vec_int4 ft = vec_sel(t, zt, zmask);
1387  return v_float32x4(vec_float4_c(vec_or(ft, sign)));
1388 #endif
1389 }
1390 
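// FP16 pack-store: narrows four float32 lanes to half precision and stores them.
// The VSX3 path issues xvcvsphp directly through inline asm (see the fixme below)
// and gathers the halfword results with vec_mergesqe. The fallback rounds to
// nearest-even in integer arithmetic, selecting between the normal path ('nt'),
// the subnormal path ('tt', chosen by tinymask) and Inf/NaN ('naninf'), then
// reattaches the sign bit before packing down to 16 bits.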
1391 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
1392 {
1393 // fixme: Is there any builtin op or intrinsic that covers "xvcvsphp"?
1394 #if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
1395  vec_ushort8 vf16;
1396  __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (v.val));
1397  vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
1398 #else
1399  const vec_int4 signmask = vec_int4_sp(0x80000000);
1400  const vec_int4 rval = vec_int4_sp(0x3f000000);
1401 
1402  vec_int4 t = vec_int4_c(v.val);
1403  vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16));
1404  t = vec_and(vec_nor(signmask, signmask), t);
1405 
1406  vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t);
1407  vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000));
1408  vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan);
1409  vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t);
1410  vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
1411  tt = vec_sub(tt, rval);
1412  vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1));
1413  vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
1414  nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
1415  t = vec_sel(nt, tt, tinymask);
1416  t = vec_sel(naninf, t, finitemask);
1417  t = vec_or(t, sign);
1418  vec_st_l8(vec_packs(t, t), ptr);
1419 #endif
1420 }
1421 
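// v_cleanup() is a no-op on VSX; it exists so code written against the universal
// intrinsics can call it unconditionally (some backends, e.g. AVX, use it to reset
// SIMD state after a vectorized block).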
1422 inline void v_cleanup() {}
1423 
1424 
1429 
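// Dot products. The "N >> M" comments name the lane widths involved: N-bit input
// lanes, with adjacent pairs multiplied and accumulated into M-bit lanes. Where the
// ISA has a fused multiply-sum (vec_msum) it is used directly; the remaining widths
// are built from even/odd multiplies plus adds.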
1431 // 16 >> 32
1432 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
1433 { return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
1434 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1435 { return v_int32x4(vec_msum(a.val, b.val, c.val)); }
1436 
1437 // 32 >> 64
1438 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
1439 {
1440  vec_dword2 even = vec_mule(a.val, b.val);
1441  vec_dword2 odd = vec_mulo(a.val, b.val);
1442  return v_int64x2(vec_add(even, odd));
1443 }
1444 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1445 { return v_dotprod(a, b) + c; }
1446 
1447 // 8 >> 32
1448 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1449 { return v_uint32x4(vec_msum(a.val, b.val, c.val)); }
1450 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
1451 { return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); }
1452 
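// Signed 8-bit dot product: vec_msum has no signed-byte by signed-byte form, so the
// bytes are sign-extended to 16 bits first. One copy of each vector is byte-shifted
// by one (vec_sld ..., 1) so that the even-indexed bytes land where an arithmetic
// shift right by 8 sign-extends them; the unshifted copy yields the odd-indexed
// bytes the same way. Two chained vec_msum calls then accumulate all 16 products.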
1453 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
1454 {
1455  const vec_ushort8 eight = vec_ushort8_sp(8);
1456  vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
1457  vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
1458  vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
1459  vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
1460  return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
1461 }
1462 
1463 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1464 {
1465  const vec_ushort8 eight = vec_ushort8_sp(8);
1466  vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
1467  vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
1468  vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
1469  vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
1470  return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val)));
1471 }
1472 
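// 16 -> 64: there is no 64-bit multiply-sum, so the unsigned version forms 32-bit
// even/odd products with vec_mule/vec_mulo, widens them to 64 bits by merging with
// zero, and adds the partial sums. The signed version reuses the exact 16 -> 32 dot
// product, widens its 32-bit sums with v_expand, and adds the widened partial sums
// into two 64-bit lanes.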
1473 // 16 >> 64
1474 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
1475 {
1476  const vec_uint4 zero = vec_uint4_z;
1477  vec_uint4 even = vec_mule(a.val, b.val);
1478  vec_uint4 odd = vec_mulo(a.val, b.val);
1479  vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero);
1480  vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero);
1481  vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero);
1482  vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero);
1483  vec_udword2 s0 = vec_add(e0, o0);
1484  vec_udword2 s1 = vec_add(e1, o1);
1485  return v_uint64x2(vec_add(s0, s1));
1486 }
1487 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1488 { return v_dotprod_expand(a, b) + c; }
1489 
1490 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
1491 {
1492  v_int32x4 prod = v_dotprod(a, b);
1493  v_int64x2 c, d;
1494  v_expand(prod, c, d);
1495  return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
1496 }
1497 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1498 { return v_dotprod_expand(a, b) + c; }
1499 
1500 // 32 >> 64f
1501 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
1502 { return v_cvt_f64(v_dotprod(a, b)); }
1503 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1504 { return v_dotprod_expand(a, b) + c; }
1505 
1507 
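// "fast" dot products: these only promise the same overall sum as the exact
// versions above; the distribution of partial sums across lanes may differ, which
// lets them skip re-ordering shuffles (e.g. the signed 8-bit version unpacks
// high/low halves instead of even/odd bytes).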
1508 // 16 >> 32
1509 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
1510 { return v_dotprod(a, b); }
1511 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1512 { return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }
1513 // 32 >> 64
1514 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
1515 { return v_dotprod(a, b); }
1516 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1517 { return v_dotprod(a, b, c); }
1518 
1519 // 8 >> 32
1520 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
1521 { return v_dotprod_expand(a, b); }
1522 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1523 { return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
1524 
1525 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
1526 {
1527  vec_short8 a0 = vec_unpackh(a.val);
1528  vec_short8 a1 = vec_unpackl(a.val);
1529  vec_short8 b0 = vec_unpackh(b.val);
1530  vec_short8 b1 = vec_unpackl(b.val);
1531  return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
1532 }
1533 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1534 { return v_dotprod_expand_fast(a, b) + c; }
1535 
1536 // 16 >> 64
1537 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1538 { return v_dotprod_expand(a, b); }
1539 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1540 { return v_dotprod_expand(a, b, c); }
1541 
1542 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1543 {
1544  v_int32x4 prod = v_dotprod(a, b);
1545  v_int64x2 c, d;
1546  v_expand(prod, c, d);
1547  return c + d;
1548 }
1549 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1550 { return v_dotprod_expand_fast(a, b) + c; }
1551 
1552 // 32 >> 64f
1553 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1554 { return v_dotprod_expand(a, b); }
1555 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1556 { return v_dotprod_expand(a, b, c); }
1557 
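// v_matmul computes the linear combination v0*m0 + v1*m1 + v2*m2 + v3*m3 by
// splatting each lane of v and chaining fused multiply-adds; v_matmuladd drops the
// v3 term and adds the bias vector 'a' instead. Illustrative use (hypothetical
// names): transform a point p by a matrix stored as column vectors c0..c3:
//     v_float32x4 q = v_matmul(p, c0, c1, c2, c3);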
1558 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
1559  const v_float32x4& m1, const v_float32x4& m2,
1560  const v_float32x4& m3)
1561 {
1562  const vec_float4 v0 = vec_splat(v.val, 0);
1563  const vec_float4 v1 = vec_splat(v.val, 1);
1564  const vec_float4 v2 = vec_splat(v.val, 2);
1565  VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
1566  return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
1567 }
1568 
1569 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
1570  const v_float32x4& m1, const v_float32x4& m2,
1571  const v_float32x4& a)
1572 {
1573  const vec_float4 v0 = vec_splat(v.val, 0);
1574  const vec_float4 v1 = vec_splat(v.val, 1);
1575  const vec_float4 v2 = vec_splat(v.val, 2);
1576  return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
1577 }
1578 
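// 4x4 transpose: interleaving rows 0/2 and 1/3 with vec_mergeh/vec_mergel, then
// interleaving those intermediates again, places element j of every input row into
// output row j. The macro is instantiated for the three 32-bit lane types below.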
1579 #define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
1580 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1581  const _Tpvec& a2, const _Tpvec& a3, \
1582  _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1583 { \
1584  _Tpvec2 a02 = vec_mergeh(a0.val, a2.val); \
1585  _Tpvec2 a13 = vec_mergeh(a1.val, a3.val); \
1586  b0.val = vec_mergeh(a02, a13); \
1587  b1.val = vec_mergel(a02, a13); \
1588  a02 = vec_mergel(a0.val, a2.val); \
1589  a13 = vec_mergel(a1.val, a3.val); \
1590  b2.val = vec_mergeh(a02, a13); \
1591  b3.val = vec_mergel(a02, a13); \
1592 }
1593 OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
1594 OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
1595 OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
1596 
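// v_broadcast_element splats lane i (a compile-time constant index) across the
// whole vector via vec_splat.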
1597 template<int i, typename Tvec>
1598 inline Tvec v_broadcast_element(const Tvec& v)
1599 { return Tvec(vec_splat(v.val, i)); }
1600 
1601 
1602 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
1603 
1605 
1606 }
1607 
1608 #endif // OPENCV_HAL_VSX_HPP