EstervQrCode 2.0.0
Library for QR code manipulation
intrin_vsx.hpp
1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html
4
5#ifndef OPENCV_HAL_VSX_HPP
6#define OPENCV_HAL_VSX_HPP
7
8#include <algorithm>
9#include "opencv2/core/utility.hpp"
10
11#define CV_SIMD128 1
12#define CV_SIMD128_64F 1
13
14namespace cv
15{
16
18
19CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
20
22
23struct v_uint8x16
24{
25 typedef uchar lane_type;
26 enum { nlanes = 16 };
27 vec_uchar16 val;
28
29 explicit v_uint8x16(const vec_uchar16& v) : val(v)
30 {}
31 v_uint8x16()
32 {}
33 v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v))
34 {}
35 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
36 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
37 : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
38 {}
39
40 static inline v_uint8x16 zero() { return v_uint8x16(vec_uchar16_z); }
41
42 uchar get0() const
43 { return vec_extract(val, 0); }
44};
45
46struct v_int8x16
47{
48 typedef schar lane_type;
49 enum { nlanes = 16 };
50 vec_char16 val;
51
52 explicit v_int8x16(const vec_char16& v) : val(v)
53 {}
54 v_int8x16()
55 {}
56 v_int8x16(vec_bchar16 v) : val(vec_char16_c(v))
57 {}
58 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
59 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
60 : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
61 {}
62
63 static inline v_int8x16 zero() { return v_int8x16(vec_char16_z); }
64
65 schar get0() const
66 { return vec_extract(val, 0); }
67};
68
69struct v_uint16x8
70{
71 typedef ushort lane_type;
72 enum { nlanes = 8 };
73 vec_ushort8 val;
74
75 explicit v_uint16x8(const vec_ushort8& v) : val(v)
76 {}
77 v_uint16x8()
78 {}
79 v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v))
80 {}
81 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
82 : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7))
83 {}
84
85 static inline v_uint16x8 zero() { return v_uint16x8(vec_ushort8_z); }
86
87 ushort get0() const
88 { return vec_extract(val, 0); }
89};
90
91struct v_int16x8
92{
93 typedef short lane_type;
94 enum { nlanes = 8 };
95 vec_short8 val;
96
97 explicit v_int16x8(const vec_short8& v) : val(v)
98 {}
99 v_int16x8()
100 {}
101 v_int16x8(vec_bshort8 v) : val(vec_short8_c(v))
102 {}
103 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
104 : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7))
105 {}
106
107 static inline v_int16x8 zero() { return v_int16x8(vec_short8_z); }
108
109 short get0() const
110 { return vec_extract(val, 0); }
111};
112
113struct v_uint32x4
114{
115 typedef unsigned lane_type;
116 enum { nlanes = 4 };
117 vec_uint4 val;
118
119 explicit v_uint32x4(const vec_uint4& v) : val(v)
120 {}
121 v_uint32x4()
122 {}
123 v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v))
124 {}
125 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3))
126 {}
127
128 static inline v_uint32x4 zero() { return v_uint32x4(vec_uint4_z); }
129
130 uint get0() const
131 { return vec_extract(val, 0); }
132};
133
134struct v_int32x4
135{
136 typedef int lane_type;
137 enum { nlanes = 4 };
138 vec_int4 val;
139
140 explicit v_int32x4(const vec_int4& v) : val(v)
141 {}
142 v_int32x4()
143 {}
144 v_int32x4(vec_bint4 v) : val(vec_int4_c(v))
145 {}
146 v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3))
147 {}
148
149 static inline v_int32x4 zero() { return v_int32x4(vec_int4_z); }
150
151 int get0() const
152 { return vec_extract(val, 0); }
153};
154
155struct v_float32x4
156{
157 typedef float lane_type;
158 enum { nlanes = 4 };
159 vec_float4 val;
160
161 explicit v_float32x4(const vec_float4& v) : val(v)
162 {}
163 v_float32x4()
164 {}
165 v_float32x4(vec_bint4 v) : val(vec_float4_c(v))
166 {}
167 v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3))
168 {}
169
170 static inline v_float32x4 zero() { return v_float32x4(vec_float4_z); }
171
172 float get0() const
173 { return vec_extract(val, 0); }
174};
175
176struct v_uint64x2
177{
178 typedef uint64 lane_type;
179 enum { nlanes = 2 };
180 vec_udword2 val;
181
182 explicit v_uint64x2(const vec_udword2& v) : val(v)
183 {}
184 v_uint64x2()
185 {}
186 v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v))
187 {}
188 v_uint64x2(uint64 v0, uint64 v1) : val(vec_udword2_set(v0, v1))
189 {}
190
191 static inline v_uint64x2 zero() { return v_uint64x2(vec_udword2_z); }
192
193 uint64 get0() const
194 { return vec_extract(val, 0); }
195};
196
197struct v_int64x2
198{
199 typedef int64 lane_type;
200 enum { nlanes = 2 };
201 vec_dword2 val;
202
203 explicit v_int64x2(const vec_dword2& v) : val(v)
204 {}
205 v_int64x2()
206 {}
207 v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v))
208 {}
209 v_int64x2(int64 v0, int64 v1) : val(vec_dword2_set(v0, v1))
210 {}
211
212 static inline v_int64x2 zero() { return v_int64x2(vec_dword2_z); }
213
214 int64 get0() const
215 { return vec_extract(val, 0); }
216};
217
218struct v_float64x2
219{
220 typedef double lane_type;
221 enum { nlanes = 2 };
222 vec_double2 val;
223
224 explicit v_float64x2(const vec_double2& v) : val(v)
225 {}
226 v_float64x2()
227 {}
228 v_float64x2(vec_bdword2 v) : val(vec_double2_c(v))
229 {}
230 v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1))
231 {}
232
233 static inline v_float64x2 zero() { return v_float64x2(vec_double2_z); }
234
235 double get0() const
236 { return vec_extract(val, 0); }
237};
238
239#define OPENCV_HAL_IMPL_VSX_EXTRACT_N(_Tpvec, _Tp) \
240template<int i> inline _Tp v_extract_n(VSX_UNUSED(_Tpvec v)) { return vec_extract(v.val, i); }
241
242OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint8x16, uchar)
243OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int8x16, schar)
244OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint16x8, ushort)
245OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int16x8, short)
246OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint32x4, uint)
247OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int32x4, int)
248OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint64x2, uint64)
249OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int64x2, int64)
250OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float32x4, float)
251OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
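// Illustrative usage sketch (not part of the original header; assumes a VSX-enabled build):
//   v_int32x4 v(10, 20, 30, 40);
//   int x = v_extract_n<2>(v);   // x == 30, lane index is a compile-time constant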
252
253
254
255/*
256 * clang-5 aborts while parsing "vec_xxx_c" only when it appears
257 * inside a function template that is defined by a preprocessor macro.
258 *
259 * If vec_xxx_c is defined as a C++ cast instead, clang-5 accepts it.
260*/
261#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
262inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); } \
263inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \
264template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
265{ return _Tpvec((cast)a.val); }
266
267OPENCV_HAL_IMPL_VSX_INITVEC(v_uint8x16, uchar, u8, vec_uchar16)
268OPENCV_HAL_IMPL_VSX_INITVEC(v_int8x16, schar, s8, vec_char16)
269OPENCV_HAL_IMPL_VSX_INITVEC(v_uint16x8, ushort, u16, vec_ushort8)
270OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
271OPENCV_HAL_IMPL_VSX_INITVEC(v_uint32x4, uint, u32, vec_uint4)
272OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
273OPENCV_HAL_IMPL_VSX_INITVEC(v_uint64x2, uint64, u64, vec_udword2)
274OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
275OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
276OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
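// Illustrative usage sketch (hypothetical values, using the initializers above):
//   v_float32x4 ones = v_setall_f32(1.0f);           // {1.0f, 1.0f, 1.0f, 1.0f}
//   v_uint32x4  bits = v_reinterpret_as_u32(ones);   // each lane is 0x3f800000
//   v_int16x8   zero = v_setzero_s16();              // all lanes 0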
277
278#define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a) \
279inline _Tpvec v_load(const _Tp* ptr) \
280{ return _Tpvec(ld(0, ptr)); } \
281inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr)) \
282{ return _Tpvec(ld_a(0, ptr)); } \
283inline _Tpvec v_load_low(const _Tp* ptr) \
284{ return _Tpvec(vec_ld_l8(ptr)); } \
285inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
286{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
287inline void v_store(_Tp* ptr, const _Tpvec& a) \
288{ st(a.val, 0, ptr); } \
289inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
290{ st_a(a.val, 0, ptr); } \
291inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
292{ st_a(a.val, 0, ptr); } \
293inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
294{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
295inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
296{ vec_st_l8(a.val, ptr); } \
297inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
298{ vec_st_h8(a.val, ptr); }
299
300// Work around a GCC bug with aligned vector load/store:
301// if the runtime check for vec_ld/vec_st fails, we fall back to unaligned load/store.
302// https://github.com/opencv/opencv/issues/13211
303#ifdef CV_COMPILER_VSX_BROKEN_ALIGNED
304 #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
305 OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vsx_ld, vsx_st, vsx_st)
306#else
307 #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
308 OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)
309#endif
310
311OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16, uchar)
312OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16, schar)
313OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8, ushort)
314OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8, short)
315OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4, uint)
316OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4, int)
317OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float)
318
319OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld, vsx_ld, vsx_st, vsx_st)
320OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2, uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
321OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2, int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
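// Illustrative usage sketch (local buffer, using the load/store wrappers above):
//   float buf[4] = {1.f, 2.f, 3.f, 4.f};
//   v_float32x4 v = v_load(buf);   // unaligned load is always safe
//   v_store(buf, v + v);           // buf now holds {2, 4, 6, 8}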
322
323
324
325/* deinterleave & interleave */
326#define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec) \
327inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
328{ vec_ld_deinterleave(ptr, a.val, b.val);} \
329inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
330 _Tpvec& b, _Tpvec& c) \
331{ vec_ld_deinterleave(ptr, a.val, b.val, c.val); } \
332inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
333 _Tpvec& c, _Tpvec& d) \
334{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
335inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
336 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
337{ vec_st_interleave(a.val, b.val, ptr); } \
338inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
339 const _Tpvec& b, const _Tpvec& c, \
340 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
341{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
342inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
343 const _Tpvec& c, const _Tpvec& d, \
344 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
345{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
346
347OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
348OPENCV_HAL_IMPL_VSX_INTERLEAVE(schar, v_int8x16)
349OPENCV_HAL_IMPL_VSX_INTERLEAVE(ushort, v_uint16x8)
350OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
351OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
352OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
353OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
354OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
355OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
356OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)
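// Illustrative sketch: splitting packed RGB bytes into planes (rgb_ptr is a
// hypothetical pointer to at least 48 interleaved uchar values):
//   v_uint8x16 r, g, b;
//   v_load_deinterleave(rgb_ptr, r, g, b);   // r = R0..R15, g = G0..G15, b = B0..B15
//   v_store_interleave(rgb_ptr, r, g, b);    // write them back interleaved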
357
358/* Expand */
359#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \
360inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
361{ \
362 b0.val = fh(a.val); \
363 b1.val = fl(a.val); \
364} \
365inline _Tpwvec v_expand_low(const _Tpvec& a) \
366{ return _Tpwvec(fh(a.val)); } \
367inline _Tpwvec v_expand_high(const _Tpvec& a) \
368{ return _Tpwvec(fl(a.val)); } \
369inline _Tpwvec v_load_expand(const _Tp* ptr) \
370{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
371
372OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
373OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
374OPENCV_HAL_IMPL_VSX_EXPAND(v_uint16x8, v_uint32x4, ushort, vec_unpacklu, vec_unpackhu)
375OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh)
376OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
377OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
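// Illustrative sketch: widening 8-bit lanes to 16 bits before arithmetic
// (src is a hypothetical pointer to 16 uchar values):
//   v_uint8x16 a = v_load(src);
//   v_uint16x8 lo, hi;
//   v_expand(a, lo, hi);   // lo = lanes 0..7, hi = lanes 8..15, zero-extended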
378
379/* Load and zero-extend a 4-byte value into the second dword; the first dword is don't-care. */
380#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
381 #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
382#else
383 /* This is compiler-agnostic, but will introduce an unneeded splat on the critical path. */
384 #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
385#endif
386
387inline v_uint32x4 v_load_expand_q(const uchar* ptr)
388{
389 // Zero-extend the extra 24 bytes instead of unpacking; this is usually faster in small kernels.
390 // Likewise, note that the value is zero-extended and the upper 4 bytes are zeroed.
391 vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
392 vec_uchar16 out;
393
394 _LXSIWZX(out, ptr, vec_uchar16);
395 out = vec_perm(out, out, pmu);
396 return v_uint32x4((vec_uint4)out);
397}
398
399inline v_int32x4 v_load_expand_q(const schar* ptr)
400{
401 vec_char16 out;
402 vec_short8 outs;
403 vec_int4 outw;
404
405 _LXSIWZX(out, ptr, vec_char16);
406 outs = vec_unpackl(out);
407 outw = vec_unpackh(outs);
408 return v_int32x4(outw);
409}
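// Illustrative usage sketch: v_load_expand_q reads only four bytes and widens
// them to 32-bit lanes.
//   uchar quad[4] = {1, 2, 3, 250};
//   v_uint32x4 w = v_load_expand_q(quad);   // lanes {1, 2, 3, 250}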
410
411/* pack */
412#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
413inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
414{ \
415 return _Tpvec(pkfnc(a.val, b.val)); \
416} \
417inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
418{ \
419 vec_st_l8(pkfnc(a.val, a.val), ptr); \
420} \
421template<int n> \
422inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
423{ \
424 const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
425 const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
426 return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn))); \
427} \
428template<int n> \
429inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
430{ \
431 const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
432 const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
433 vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr); \
434}
435
436OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short,
437 vec_sr, vec_packs, vec_adds, pack)
438OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short,
439 vec_sra, vec_packs, vec_adds, pack)
440
441OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int,
442 vec_sr, vec_packs, vec_add, pack)
443OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int,
444 vec_sra, vec_packs, vec_add, pack)
445
446OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long,
447 vec_sr, vec_pack, vec_add, pack)
448OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long,
449 vec_sra, vec_pack, vec_add, pack)
450
451OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short,
452 vec_sra, vec_packsu, vec_adds, pack_u)
453OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
454 vec_sra, vec_packsu, vec_add, pack_u)
455// The following variant is not implemented on other platforms:
456//OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
457// vec_sra, vec_packsu, vec_add, pack_u)
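// Illustrative sketch of the rounding pack: v_rshr_pack<n> adds the rounding
// delta (1 << (n-1)), shifts right by n, then packs with saturation.
//   v_int16x8 a = v_setall_s16(1000), b = v_setall_s16(-1000);
//   v_int8x16 p = v_rshr_pack<4>(a, b);   // first 8 lanes are 63, last 8 are -62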
458
459// pack boolean
460inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
461{
462 vec_uchar16 ab = vec_pack(a.val, b.val);
463 return v_uint8x16(ab);
464}
465
466inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
467 const v_uint32x4& c, const v_uint32x4& d)
468{
469 vec_ushort8 ab = vec_pack(a.val, b.val);
470 vec_ushort8 cd = vec_pack(c.val, d.val);
471 return v_uint8x16(vec_pack(ab, cd));
472}
473
474inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
475 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
476 const v_uint64x2& g, const v_uint64x2& h)
477{
478 vec_uint4 ab = vec_pack(a.val, b.val);
479 vec_uint4 cd = vec_pack(c.val, d.val);
480 vec_uint4 ef = vec_pack(e.val, f.val);
481 vec_uint4 gh = vec_pack(g.val, h.val);
482
483 vec_ushort8 abcd = vec_pack(ab, cd);
484 vec_ushort8 efgh = vec_pack(ef, gh);
485 return v_uint8x16(vec_pack(abcd, efgh));
486}
487
488/* Recombine */
489template <typename _Tpvec>
490inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
491{
492 b0.val = vec_mergeh(a0.val, a1.val);
493 b1.val = vec_mergel(a0.val, a1.val);
494}
495
496template <typename _Tpvec>
497inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
498{ return _Tpvec(vec_mergesql(a.val, b.val)); }
499
500template <typename _Tpvec>
501inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
502{ return _Tpvec(vec_mergesqh(a.val, b.val)); }
503
504template <typename _Tpvec>
505inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
506{
507 c.val = vec_mergesqh(a.val, b.val);
508 d.val = vec_mergesql(a.val, b.val);
509}
510
512
513/* Element-wise binary and unary operations */
515#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
516inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
517{ return _Tpvec(intrin(a.val, b.val)); } \
518inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
519{ a.val = intrin(a.val, b.val); return a; }
520
521OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
522OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
523OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
524OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
525OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
526OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
527OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
528OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
529OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
530OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
531OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
532OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
533OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
534OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
535OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
536OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
537OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
538OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
539OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
540OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
541OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
542OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
543OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
544OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
545OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
546OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
547
548// saturating multiply
549#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \
550 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
551 { \
552 _Tpwvec c, d; \
553 v_mul_expand(a, b, c, d); \
554 return v_pack(c, d); \
555 } \
556 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
557 { a = a * b; return a; }
558
559OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8)
560OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
561OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8, v_int32x4)
562OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)
563
564template<typename Tvec, typename Twvec>
565inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
566{
567 Twvec p0 = Twvec(vec_mule(a.val, b.val));
568 Twvec p1 = Twvec(vec_mulo(a.val, b.val));
569 v_zip(p0, p1, c, d);
570}
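// Illustrative sketch: the saturating operator* above widens via v_mul_expand
// and packs back with saturation, so narrow lanes clamp instead of wrapping.
//   v_uint8x16 a = v_setall_u8(100), b = v_setall_u8(3);
//   v_uint8x16 c = a * b;   // every lane is 255 (300 saturated), not 300 % 256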
571
572inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
573{
574 vec_int4 p0 = vec_mule(a.val, b.val);
575 vec_int4 p1 = vec_mulo(a.val, b.val);
576 static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
577 return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm));
578}
579inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
580{
581 vec_uint4 p0 = vec_mule(a.val, b.val);
582 vec_uint4 p1 = vec_mulo(a.val, b.val);
583 static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
584 return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm));
585}
586
588#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) \
589template<typename _Tpvec> \
590inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
591{ return _Tpvec(intrin(a.val, b.val)); }
592
593OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
594OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
595OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
596
597
598#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
599inline _Tpvec operator << (const _Tpvec& a, int imm) \
600{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
601inline _Tpvec operator >> (const _Tpvec& a, int imm) \
602{ return _Tpvec(shr(a.val, splfunc(imm))); } \
603template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
604{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
605template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
606{ return _Tpvec(shr(a.val, splfunc(imm))); }
607
608OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
609OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
610OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
611OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
612// algebraic right shift
613OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
614OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
615OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
616OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
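// Illustrative sketch: shifts on signed types use vec_sra, preserving the sign bit.
//   v_int32x4 a = v_setall_s32(-16);
//   v_int32x4 b = v_shr<2>(a);   // every lane is -4 (arithmetic shift)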
617
618
619#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
620OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
621OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
622OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
623inline _Tpvec operator ~ (const _Tpvec& a) \
624{ return _Tpvec(vec_not(a.val)); }
625
626OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
627OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int8x16)
628OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint16x8)
629OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int16x8)
630OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint32x4)
631OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int32x4)
632OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint64x2)
633OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int64x2)
634OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float32x4)
635OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float64x2)
636
637
638#define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast) \
639inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
640{ return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }
641
642OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
643OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
644OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
645OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
646OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
647OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
648OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
649OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
650
651
652#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
653inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
654{ return _Tpvec(vec_cmpeq(a.val, b.val)); } \
655inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
656{ return _Tpvec(vec_cmpne(a.val, b.val)); } \
657inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
658{ return _Tpvec(vec_cmplt(a.val, b.val)); } \
659inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
660{ return _Tpvec(vec_cmpgt(a.val, b.val)); } \
661inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
662{ return _Tpvec(vec_cmple(a.val, b.val)); } \
663inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
664{ return _Tpvec(vec_cmpge(a.val, b.val)); }
665
666OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
667OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
668OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint16x8)
669OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
670OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint32x4)
671OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
672OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float32x4)
673OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
674OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
675OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
676
677inline v_float32x4 v_not_nan(const v_float32x4& a)
678{ return v_float32x4(vec_cmpeq(a.val, a.val)); }
679inline v_float64x2 v_not_nan(const v_float64x2& a)
680{ return v_float64x2(vec_cmpeq(a.val, a.val)); }
681
683OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
684OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
685
686
687#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
688template<int imm> \
689inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
690{ \
691 const int wd = imm * sizeof(typename _Tpvec::lane_type); \
692 if (wd > 15) \
693 return _Tpvec::zero(); \
694 return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
695}
696
697#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
698OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
699OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
700
701OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
702OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
703OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
704OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
705OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
706OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
707OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
708OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
709OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
710OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
711
712template<int imm, typename _Tpvec>
713inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
714{
715 enum { CV_SHIFT = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };
716 if (CV_SHIFT == 16)
717 return a;
718#ifdef __IBMCPP__
719 return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT & 15));
720#else
721 return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT));
722#endif
723}
724
725template<int imm, typename _Tpvec>
726inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
727{
728 enum { CV_SHIFT = imm * (sizeof(typename _Tpvec::lane_type)) };
729 if (CV_SHIFT == 16)
730 return b;
731 return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
732}
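// Illustrative sketch of element rotation (lanes shift as whole elements; the
// single-vector form fills with zeros, the two-vector form pulls lanes from b):
//   v_int32x4 a(1, 2, 3, 4), b(5, 6, 7, 8);
//   v_rotate_right<1>(a);      // -> {2, 3, 4, 0}
//   v_rotate_right<1>(a, b);   // -> {2, 3, 4, 5}
//   v_rotate_left<1>(a);       // -> {0, 1, 2, 3}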
733
734#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2) \
735template<int imm> \
736inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
737{ \
738 if (imm == 1) \
739 return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
740 return imm ? b : a; \
741}
742
743#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec) \
744OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left, b, a) \
745OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)
746
747OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
748OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
749OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
750
751/* Reverse */
752inline v_uint8x16 v_reverse(const v_uint8x16 &a)
753{
754 static const vec_uchar16 perm = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
755 vec_uchar16 vec = (vec_uchar16)a.val;
756 return v_uint8x16(vec_perm(vec, vec, perm));
757}
758
759inline v_int8x16 v_reverse(const v_int8x16 &a)
760{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
761
762inline v_uint16x8 v_reverse(const v_uint16x8 &a)
763{
764 static const vec_uchar16 perm = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
765 vec_uchar16 vec = (vec_uchar16)a.val;
766 return v_reinterpret_as_u16(v_uint8x16(vec_perm(vec, vec, perm)));
767}
768
769inline v_int16x8 v_reverse(const v_int16x8 &a)
770{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
771
772inline v_uint32x4 v_reverse(const v_uint32x4 &a)
773{
774 static const vec_uchar16 perm = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
775 vec_uchar16 vec = (vec_uchar16)a.val;
776 return v_reinterpret_as_u32(v_uint8x16(vec_perm(vec, vec, perm)));
777}
778
779inline v_int32x4 v_reverse(const v_int32x4 &a)
780{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
781
782inline v_float32x4 v_reverse(const v_float32x4 &a)
783{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
784
785inline v_uint64x2 v_reverse(const v_uint64x2 &a)
786{
787 static const vec_uchar16 perm = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
788 vec_uchar16 vec = (vec_uchar16)a.val;
789 return v_reinterpret_as_u64(v_uint8x16(vec_perm(vec, vec, perm)));
790}
791
792inline v_int64x2 v_reverse(const v_int64x2 &a)
793{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
794
795inline v_float64x2 v_reverse(const v_float64x2 &a)
796{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
797
798/* Extract */
799template<int s, typename _Tpvec>
800inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
801{ return v_rotate_right<s>(a, b); }
802
804
806inline uint v_reduce_sum(const v_uint8x16& a)
807{
808 const vec_uint4 zero4 = vec_uint4_z;
809 vec_uint4 sum4 = vec_sum4s(a.val, zero4);
810 return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
811}
812inline int v_reduce_sum(const v_int8x16& a)
813{
814 const vec_int4 zero4 = vec_int4_z;
815 vec_int4 sum4 = vec_sum4s(a.val, zero4);
816 return (int)vec_extract(vec_sums(sum4, zero4), 3);
817}
818inline int v_reduce_sum(const v_int16x8& a)
819{
820 const vec_int4 zero = vec_int4_z;
821 return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
822}
823inline uint v_reduce_sum(const v_uint16x8& a)
824{
825 const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
826 return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
827}
828
829#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
830inline scalartype v_reduce_##suffix(const _Tpvec& a) \
831{ \
832 const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
833 return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0); \
834}
835OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, sum, vec_add)
836OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, max, vec_max)
837OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, min, vec_min)
838OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
839OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
840OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
841OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
842OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
843OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
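// Illustrative usage sketch of the horizontal reductions:
//   v_uint32x4 v(1, 2, 3, 4);
//   unsigned s = v_reduce_sum(v);   // 10
//   unsigned m = v_reduce_max(v);   // 4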
844
845inline uint64 v_reduce_sum(const v_uint64x2& a)
846{
847 return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
848}
849inline int64 v_reduce_sum(const v_int64x2& a)
850{
851 return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
852}
853inline double v_reduce_sum(const v_float64x2& a)
854{
855 return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
856}
857
858#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
859inline scalartype v_reduce_##suffix(const _Tpvec& a) \
860{ \
861 _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
862 rs = func(rs, vec_sld(rs, rs, 4)); \
863 return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0); \
864}
865OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, max, vec_max)
866OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
867OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
868OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
869
870#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
871inline scalartype v_reduce_##suffix(const _Tpvec& a) \
872{ \
873 _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
874 rs = func(rs, vec_sld(rs, rs, 4)); \
875 rs = func(rs, vec_sld(rs, rs, 2)); \
876 return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
877}
878OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
879OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
880OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
881OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
882
883inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
884 const v_float32x4& c, const v_float32x4& d)
885{
886 vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
887 ac = vec_add(ac, vec_sld(ac, ac, 8));
888
889 vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
890 bd = vec_add(bd, vec_sld(bd, bd, 8));
891 return v_float32x4(vec_mergeh(ac, bd));
892}
893
894inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
895{
896 const vec_uint4 zero4 = vec_uint4_z;
897 vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
898 return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
899}
900inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
901{
902 const vec_int4 zero4 = vec_int4_z;
903 vec_char16 ad = vec_abss(vec_subs(a.val, b.val));
904 vec_int4 sum4 = vec_sum4s(ad, zero4);
905 return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
906}
907inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
908{
909 vec_ushort8 ad = vec_absd(a.val, b.val);
910 VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)) + vec_int4_c(vec_unpacklu(ad)), vec_int4_z);
911 return (unsigned)vec_extract(sum, 3);
912}
913inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
914{
915 const vec_int4 zero4 = vec_int4_z;
916 vec_short8 ad = vec_abss(vec_subs(a.val, b.val));
917 vec_int4 sum4 = vec_sum4s(ad, zero4);
918 return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
919}
920inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
921{
922 const vec_uint4 ad = vec_absd(a.val, b.val);
923 const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
924 return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
925}
926inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
927{
928 vec_int4 ad = vec_abss(vec_sub(a.val, b.val));
929 return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3);
930}
931inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
932{
933 const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
934 const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
935 return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
936}
937
939inline v_uint8x16 v_popcount(const v_uint8x16& a)
940{ return v_uint8x16(vec_popcntu(a.val)); }
941inline v_uint8x16 v_popcount(const v_int8x16& a)
942{ return v_uint8x16(vec_popcntu(a.val)); }
943inline v_uint16x8 v_popcount(const v_uint16x8& a)
944{ return v_uint16x8(vec_popcntu(a.val)); }
945inline v_uint16x8 v_popcount(const v_int16x8& a)
946{ return v_uint16x8(vec_popcntu(a.val)); }
947inline v_uint32x4 v_popcount(const v_uint32x4& a)
948{ return v_uint32x4(vec_popcntu(a.val)); }
949inline v_uint32x4 v_popcount(const v_int32x4& a)
950{ return v_uint32x4(vec_popcntu(a.val)); }
951inline v_uint64x2 v_popcount(const v_uint64x2& a)
952{ return v_uint64x2(vec_popcntu(a.val)); }
953inline v_uint64x2 v_popcount(const v_int64x2& a)
954{ return v_uint64x2(vec_popcntu(a.val)); }
955
957inline int v_signmask(const v_uint8x16& a)
958{
959 static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
960 return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
961}
962inline int v_signmask(const v_int8x16& a)
963{ return v_signmask(v_reinterpret_as_u8(a)); }
964
965inline int v_signmask(const v_int16x8& a)
966{
967 static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
968 return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
969}
970inline int v_signmask(const v_uint16x8& a)
971{ return v_signmask(v_reinterpret_as_s16(a)); }
972
973inline int v_signmask(const v_int32x4& a)
974{
975 static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
976 return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
977}
978inline int v_signmask(const v_uint32x4& a)
979{ return v_signmask(v_reinterpret_as_s32(a)); }
980inline int v_signmask(const v_float32x4& a)
981{ return v_signmask(v_reinterpret_as_s32(a)); }
982
983inline int v_signmask(const v_int64x2& a)
984{
985 VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
986 return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
987}
988inline int v_signmask(const v_uint64x2& a)
989{ return v_signmask(v_reinterpret_as_s64(a)); }
990inline int v_signmask(const v_float64x2& a)
991{ return v_signmask(v_reinterpret_as_s64(a)); }
992
993inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
994inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
995inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
996inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
997inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
998inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
999inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
1000inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
1001inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
1002inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
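// Illustrative sketch: v_signmask packs each lane's sign bit into an integer
// (lane 0 in bit 0); v_scan_forward returns the index of the first set lane.
//   v_int32x4 v(-1, 2, -3, 4);
//   int m = v_signmask(v);       // 0b0101 == 5
//   int f = v_scan_forward(v);   // 0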
1003
1004template<typename _Tpvec>
1005inline bool v_check_all(const _Tpvec& a)
1006{ return vec_all_lt(a.val, _Tpvec::zero().val); }
1007inline bool v_check_all(const v_uint8x16& a)
1008{ return v_check_all(v_reinterpret_as_s8(a)); }
1009inline bool v_check_all(const v_uint16x8& a)
1010{ return v_check_all(v_reinterpret_as_s16(a)); }
1011inline bool v_check_all(const v_uint32x4& a)
1012{ return v_check_all(v_reinterpret_as_s32(a)); }
1013inline bool v_check_all(const v_uint64x2& a)
1014{ return v_check_all(v_reinterpret_as_s64(a)); }
1015inline bool v_check_all(const v_float32x4& a)
1016{ return v_check_all(v_reinterpret_as_s32(a)); }
1017inline bool v_check_all(const v_float64x2& a)
1018{ return v_check_all(v_reinterpret_as_s64(a)); }
1019
1020template<typename _Tpvec>
1021inline bool v_check_any(const _Tpvec& a)
1022{ return vec_any_lt(a.val, _Tpvec::zero().val); }
1023inline bool v_check_any(const v_uint8x16& a)
1024{ return v_check_any(v_reinterpret_as_s8(a)); }
1025inline bool v_check_any(const v_uint16x8& a)
1026{ return v_check_any(v_reinterpret_as_s16(a)); }
1027inline bool v_check_any(const v_uint32x4& a)
1028{ return v_check_any(v_reinterpret_as_s32(a)); }
1029inline bool v_check_any(const v_uint64x2& a)
1030{ return v_check_any(v_reinterpret_as_s64(a)); }
1031inline bool v_check_any(const v_float32x4& a)
1032{ return v_check_any(v_reinterpret_as_s32(a)); }
1033inline bool v_check_any(const v_float64x2& a)
1034{ return v_check_any(v_reinterpret_as_s64(a)); }
1035
1037
1039inline v_float32x4 v_sqrt(const v_float32x4& x)
1040{ return v_float32x4(vec_sqrt(x.val)); }
1041inline v_float64x2 v_sqrt(const v_float64x2& x)
1042{ return v_float64x2(vec_sqrt(x.val)); }
1043
1044inline v_float32x4 v_invsqrt(const v_float32x4& x)
1045{ return v_float32x4(vec_rsqrt(x.val)); }
1046inline v_float64x2 v_invsqrt(const v_float64x2& x)
1047{ return v_float64x2(vec_rsqrt(x.val)); }
1048
1049#define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec) \
1050inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1051{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
1052inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1053{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \
1054inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1055{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } \
1056inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1057{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }
1058
1059OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
1060OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
1061
1062inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1063{ return a * b + c; }
1064
1065// TODO: exp, log, sin, cos
1066
1068inline v_uint8x16 v_abs(const v_int8x16& x)
1069{ return v_uint8x16(vec_uchar16_c(vec_abs(x.val))); }
1070
1071inline v_uint16x8 v_abs(const v_int16x8& x)
1072{ return v_uint16x8(vec_ushort8_c(vec_abs(x.val))); }
1073
1074inline v_uint32x4 v_abs(const v_int32x4& x)
1075{ return v_uint32x4(vec_uint4_c(vec_abs(x.val))); }
1076
1077inline v_float32x4 v_abs(const v_float32x4& x)
1078{ return v_float32x4(vec_abs(x.val)); }
1079
1080inline v_float64x2 v_abs(const v_float64x2& x)
1081{ return v_float64x2(vec_abs(x.val)); }
1082
1084// unsigned
1085OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)
1086
1087inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1088{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1089inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1090{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1091inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1092{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }
1093
1094inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
1095{ return v_abs(a - b); }
1096inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
1097{ return v_abs(a - b); }
1098
1100inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1101{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); }
1102inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1103{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); }
1104
1106
1108inline v_int32x4 v_round(const v_float32x4& a)
1109{ return v_int32x4(vec_cts(vec_rint(a.val))); }
1110
1111inline v_int32x4 v_round(const v_float64x2& a)
1112{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); }
1113
1114inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1115{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); }
1116
1117inline v_int32x4 v_floor(const v_float32x4& a)
1118{ return v_int32x4(vec_cts(vec_floor(a.val))); }
1119
1120inline v_int32x4 v_floor(const v_float64x2& a)
1121{ return v_int32x4(vec_mergesqo(vec_ctso(vec_floor(a.val)), vec_int4_z)); }
1122
1123inline v_int32x4 v_ceil(const v_float32x4& a)
1124{ return v_int32x4(vec_cts(vec_ceil(a.val))); }
1125
1126inline v_int32x4 v_ceil(const v_float64x2& a)
1127{ return v_int32x4(vec_mergesqo(vec_ctso(vec_ceil(a.val)), vec_int4_z)); }
1128
1129inline v_int32x4 v_trunc(const v_float32x4& a)
1130{ return v_int32x4(vec_cts(a.val)); }
1131
1132inline v_int32x4 v_trunc(const v_float64x2& a)
1133{ return v_int32x4(vec_mergesqo(vec_ctso(a.val), vec_int4_z)); }
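// Illustrative sketch of the rounding family (v_round follows the default
// round-to-nearest-even mode):
//   v_float32x4 x(1.5f, -1.5f, 2.7f, -2.7f);
//   v_round(x);   // {2, -2, 3, -3}
//   v_floor(x);   // {1, -2, 2, -3}
//   v_trunc(x);   // {1, -1, 2, -2}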
1134
1136inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1137{ return v_float32x4(vec_ctf(a.val)); }
1138
1139inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1140{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }
1141
1142inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1143{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }
1144
1145inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1146{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
1147
1148inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1149{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }
1150
1151inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1152{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }
1153
1154inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1155{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
1156
1157inline v_float64x2 v_cvt_f64(const v_int64x2& a)
1158{ return v_float64x2(vec_ctd(a.val)); }
1159
1161
1162inline v_int8x16 v_lut(const schar* tab, const int* idx)
1163{
1164 return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
1165 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
1166}
1167inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1168{
1169 return v_reinterpret_as_s8(v_int16x8(*(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]),
1170 *(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7])));
1171}
1172inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1173{
1174 return v_reinterpret_as_s8(v_int32x4(*(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3])));
1175}
1176inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
1177inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
1178inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
1179
1180inline v_int16x8 v_lut(const short* tab, const int* idx)
1181{
1182 return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
1183}
1184inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1185{
1186 return v_reinterpret_as_s16(v_int32x4(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
1187}
1188inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1189{
1190 return v_reinterpret_as_s16(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
1191}
1192inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
1193inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
1194inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
1195
1196inline v_int32x4 v_lut(const int* tab, const int* idx)
1197{
1198 return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1199}
1200inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1201{
1202 return v_reinterpret_as_s32(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
1203}
1204inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1205{
1206 return v_int32x4(vsx_ld(0, tab + idx[0]));
1207}
1208inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); }
1209inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
1210inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
1211
1212inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1213{
1214 return v_int64x2(tab[idx[0]], tab[idx[1]]);
1215}
1216inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1217{
1218 return v_int64x2(vsx_ld2(0, tab + idx[0]));
1219}
1220inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1221inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1222
1223inline v_float32x4 v_lut(const float* tab, const int* idx)
1224{
1225 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1226}
1227inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int*)tab, idx)); }
1228inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_load(tab + *idx); }
1229
1230inline v_float64x2 v_lut(const double* tab, const int* idx)
1231{
1232 return v_float64x2(tab[idx[0]], tab[idx[1]]);
1233}
1234inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_load(tab + *idx); }
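// Illustrative usage sketch of the scalar-index lookup:
//   float table[8] = {0, 10, 20, 30, 40, 50, 60, 70};
//   int   idx[4]   = {6, 4, 2, 0};
//   v_float32x4 g  = v_lut(table, idx);   // {60, 40, 20, 0}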
1235
1236inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1237{
1238 const int idx[4] = {
1239 vec_extract(idxvec.val, 0),
1240 vec_extract(idxvec.val, 1),
1241 vec_extract(idxvec.val, 2),
1242 vec_extract(idxvec.val, 3)
1243 };
1244 return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1245}
1246
1247inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1248{
1249 const int idx[4] = {
1250 vec_extract(idxvec.val, 0),
1251 vec_extract(idxvec.val, 1),
1252 vec_extract(idxvec.val, 2),
1253 vec_extract(idxvec.val, 3)
1254 };
1255 return v_uint32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1256}
1257
1258inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1259{
1260 const int idx[4] = {
1261 vec_extract(idxvec.val, 0),
1262 vec_extract(idxvec.val, 1),
1263 vec_extract(idxvec.val, 2),
1264 vec_extract(idxvec.val, 3)
1265 };
1266 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1267}
1268
1269inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1270{
1271 const int idx[2] = {
1272 vec_extract(idxvec.val, 0),
1273 vec_extract(idxvec.val, 1)
1274 };
1275 return v_float64x2(tab[idx[0]], tab[idx[1]]);
1276}
1277
1278inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1279{
1280 vec_float4 xy0 = vec_ld_l8(tab + vec_extract(idxvec.val, 0));
1281 vec_float4 xy1 = vec_ld_l8(tab + vec_extract(idxvec.val, 1));
1282 vec_float4 xy2 = vec_ld_l8(tab + vec_extract(idxvec.val, 2));
1283 vec_float4 xy3 = vec_ld_l8(tab + vec_extract(idxvec.val, 3));
1284 vec_float4 xy02 = vec_mergeh(xy0, xy2); // x0, x2, y0, y2
1285 vec_float4 xy13 = vec_mergeh(xy1, xy3); // x1, x3, y1, y3
1286 x.val = vec_mergeh(xy02, xy13);
1287 y.val = vec_mergel(xy02, xy13);
1288}
1289inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1290{
1291 vec_double2 xy0 = vsx_ld(vec_extract(idxvec.val, 0), tab);
1292 vec_double2 xy1 = vsx_ld(vec_extract(idxvec.val, 1), tab);
1293 x.val = vec_mergeh(xy0, xy1);
1294 y.val = vec_mergel(xy0, xy1);
1295}
1296
1297inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
1298{
1299 static const vec_uchar16 perm = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15};
1300 return v_int8x16(vec_perm(vec.val, vec.val, perm));
1301}
1302inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
1303{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1304
1305inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
1306{
1307 static const vec_uchar16 perm = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
1308 return v_int8x16(vec_perm(vec.val, vec.val, perm));
1309}
1310inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
1311{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1312
1313inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
1314{
1315 static const vec_uchar16 perm = {0,1, 4,5, 2,3, 6,7, 8,9, 12,13, 10,11, 14,15};
1316 return v_int16x8(vec_perm(vec.val, vec.val, perm));
1317}
1318inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
1319{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1320
1321inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
1322{
1323 static const vec_uchar16 perm = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
1324 return v_int16x8(vec_perm(vec.val, vec.val, perm));
1325}
1326inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
1327{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1328
1329inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
1330{
1331 static const vec_uchar16 perm = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15};
1332 return v_int32x4(vec_perm(vec.val, vec.val, perm));
1333}
1334inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
1335{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1336inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
1337{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1338
1339inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
1340{
1341 static const vec_uchar16 perm = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 15, 15, 15};
1342 return v_int8x16(vec_perm(vec.val, vec.val, perm));
1343}
1344inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
1345{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1346
1347inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
1348{
1349 static const vec_uchar16 perm = {0,1, 2,3, 4,5, 8,9, 10,11, 12,13, 14,15, 14,15};
1350 return v_int16x8(vec_perm(vec.val, vec.val, perm));
1351}
1352inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
1353{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1354
1355inline v_int32x4 v_pack_triplets(const v_int32x4& vec)
1356{ return vec; }
1357inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec)
1358{ return vec; }
1359inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
1360{ return vec; }
1361
1363
1364inline v_float32x4 v_load_expand(const hfloat* ptr)
1365{
1366 vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
1367#if CV_VSX3 && defined(vec_extract_fp_from_shorth)
1368 return v_float32x4(vec_extract_fp_from_shorth(vf16));
1369#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
1370 vec_float4 vf32;
1371 __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wa" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
1372 return v_float32x4(vf32);
1373#else
1374 const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
1375 const vec_int4 signmask = vec_int4_sp(0x80000000);
1376 const vec_int4 maxexp = vec_int4_sp(0x7c000000);
1377 const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));
1378
1379 vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16)));
1380 vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
1381 vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
1382 vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf));
1383
1384 t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e)));
1385 vec_bint4 zmask = vec_cmpeq(e, z);
1386 vec_int4 ft = vec_sel(t, zt, zmask);
1387 return v_float32x4(vec_float4_c(vec_or(ft, sign)));
1388#endif
1389}
1390
1391inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
1392{
1393// fixme: Is there any builtin op or intrinsic that covers "xvcvsphp"?
1394#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
1395 vec_ushort8 vf16;
1396 __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (v.val));
1397 vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
1398#else
1399 const vec_int4 signmask = vec_int4_sp(0x80000000);
1400 const vec_int4 rval = vec_int4_sp(0x3f000000);
1401
1402 vec_int4 t = vec_int4_c(v.val);
1403 vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16));
1404 t = vec_and(vec_nor(signmask, signmask), t);
1405
1406 vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t);
1407 vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000));
1408 vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan);
1409 vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t);
1410 vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
1411 tt = vec_sub(tt, rval);
1412 vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1));
1413 vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
1414 nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
1415 t = vec_sel(nt, tt, tinymask);
1416 t = vec_sel(naninf, t, finitemask);
1417 t = vec_or(t, sign);
1418 vec_st_l8(vec_packs(t, t), ptr);
1419#endif
1420}
1421
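// Illustrative sketch (not part of this header): the two FP16 helpers above map
// half-precision storage to single-precision lanes and back. Minimal example
// assuming opencv2/core/hal/intrin.hpp and the cv::hfloat storage type used by
// this header; scale_f16 is a hypothetical helper name.
#include <opencv2/core/hal/intrin.hpp>
static void scale_f16(const cv::hfloat* src, cv::hfloat* dst, int n, float s)
{
    using namespace cv;
    v_float32x4 vs = v_setall_f32(s);
    int i = 0;
    for (; i + 4 <= n; i += 4)
    {
        v_float32x4 v = v_load_expand(src + i); // 4 halves -> 4 floats
        v_pack_store(dst + i, v * vs);          // scale and round back to 4 halves
    }
    for (; i < n; ++i)                          // scalar tail
        dst[i] = hfloat((float)src[i] * s);
}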
1422inline void v_cleanup() {}
1423
1424
1429
1431// 16 >> 32
1432inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
1433{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
1434inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1435{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
1436
1437// 32 >> 64
1438inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
1439{
1440 vec_dword2 even = vec_mule(a.val, b.val);
1441 vec_dword2 odd = vec_mulo(a.val, b.val);
1442 return v_int64x2(vec_add(even, odd));
1443}
1444inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1445{ return v_dotprod(a, b) + c; }
1446
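// Illustrative sketch (not part of this header): v_dotprod() multiplies adjacent
// 16-bit lane pairs and sums each pair into a 32-bit lane, so a full dot product
// only needs a final v_reduce_sum(). Minimal example assuming
// opencv2/core/hal/intrin.hpp; dot_s16 is a hypothetical helper name.
#include <opencv2/core/hal/intrin.hpp>
static int dot_s16(const short* a, const short* b, int n)
{
    using namespace cv;
    v_int32x4 acc = v_setzero_s32();
    int i = 0;
    for (; i + 8 <= n; i += 8)
        acc = v_dotprod(v_load(a + i), v_load(b + i), acc); // 8 products folded into 4 lanes
    int sum = v_reduce_sum(acc);
    for (; i < n; ++i)                                      // scalar tail
        sum += (int)a[i] * b[i];
    return sum;
}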
1447// 8 >> 32
1448inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1449{ return v_uint32x4(vec_msum(a.val, b.val, c.val)); }
1450inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
1451{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); }
1452
1453inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
1454{
1455 const vec_ushort8 eight = vec_ushort8_sp(8);
1456 vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
1457 vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
1458 vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
1459 vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
1460 return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
1461}
1462
1463inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1464{
1465 const vec_ushort8 eight = vec_ushort8_sp(8);
1466 vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
1467 vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
1468 vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
1469 vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
1470 return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val)));
1471}
1472
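// Illustrative sketch (not part of this header): the 8 >> 32 expanding dot product
// widens quads of byte products straight into 32-bit accumulators, e.g. for a sum
// of squares. Minimal example assuming opencv2/core/hal/intrin.hpp; sum_sq_u8 is a
// hypothetical helper name.
#include <opencv2/core/hal/intrin.hpp>
static unsigned sum_sq_u8(const uchar* p, int n)
{
    using namespace cv;
    v_uint32x4 acc = v_setzero_u32();
    int i = 0;
    for (; i + 16 <= n; i += 16)
    {
        v_uint8x16 v = v_load(p + i);
        acc = v_dotprod_expand(v, v, acc); // 16 squared bytes added into 4 lanes
    }
    unsigned s = v_reduce_sum(acc);
    for (; i < n; ++i)                     // scalar tail
        s += (unsigned)p[i] * p[i];
    return s;
}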
1473// 16 >> 64
1474inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
1475{
1476 const vec_uint4 zero = vec_uint4_z;
1477 vec_uint4 even = vec_mule(a.val, b.val);
1478 vec_uint4 odd = vec_mulo(a.val, b.val);
1479 vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero);
1480 vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero);
1481 vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero);
1482 vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero);
1483 vec_udword2 s0 = vec_add(e0, o0);
1484 vec_udword2 s1 = vec_add(e1, o1);
1485 return v_uint64x2(vec_add(s0, s1));
1486}
1487inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1488{ return v_dotprod_expand(a, b) + c; }
1489
1490inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
1491{
1492 v_int32x4 prod = v_dotprod(a, b);
1493 v_int64x2 c, d;
1494 v_expand(prod, c, d);
1495 return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
1496}
1497inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1498{ return v_dotprod_expand(a, b) + c; }
1499
1500// 32 >> 64f
1501inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
1502{ return v_cvt_f64(v_dotprod(a, b)); }
1503inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1504{ return v_dotprod_expand(a, b) + c; }
1505
1507
1508// 16 >> 32
1509inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
1510{ return v_dotprod(a, b); }
1511inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1512{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }
1513// 32 >> 64
1514inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
1515{ return v_dotprod(a, b); }
1516inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1517{ return v_dotprod(a, b, c); }
1518
1519// 8 >> 32
1520inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
1521{ return v_dotprod_expand(a, b); }
1522inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1523{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
1524
1525inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
1526{
1527 vec_short8 a0 = vec_unpackh(a.val);
1528 vec_short8 a1 = vec_unpackl(a.val);
1529 vec_short8 b0 = vec_unpackh(b.val);
1530 vec_short8 b1 = vec_unpackl(b.val);
1531 return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
1532}
1533inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1534{ return v_dotprod_expand_fast(a, b) + c; }
1535
1536// 16 >> 64
1537inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1538{ return v_dotprod_expand(a, b); }
1539inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1540{ return v_dotprod_expand(a, b, c); }
1541
1542inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1543{
1544 v_int32x4 prod = v_dotprod(a, b);
1545 v_int64x2 c, d;
1546 v_expand(prod, c, d);
1547 return c + d;
1548}
1549inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1550{ return v_dotprod_expand_fast(a, b) + c; }
1551
1552// 32 >> 64f
1553inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1554{ return v_dotprod_expand(a, b); }
1555inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1556{ return v_dotprod_expand(a, b, c); }
1557
1558inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
1559 const v_float32x4& m1, const v_float32x4& m2,
1560 const v_float32x4& m3)
1561{
1562 const vec_float4 v0 = vec_splat(v.val, 0);
1563 const vec_float4 v1 = vec_splat(v.val, 1);
1564 const vec_float4 v2 = vec_splat(v.val, 2);
1565 VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
1566 return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
1567}
1568
1569inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
1570 const v_float32x4& m1, const v_float32x4& m2,
1571 const v_float32x4& a)
1572{
1573 const vec_float4 v0 = vec_splat(v.val, 0);
1574 const vec_float4 v1 = vec_splat(v.val, 1);
1575 const vec_float4 v2 = vec_splat(v.val, 2);
1576 return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
1577}
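// Illustrative sketch (not part of this header): v_matmul()/v_matmuladd() treat
// m0..m3 as matrix columns, so v_matmuladd(p, c0, c1, c2, t) evaluates
// p.x*c0 + p.y*c1 + p.z*c2 + t, i.e. an affine (3x4) transform of a point held
// in the first three lanes. transform_point is a hypothetical helper name.
#include <opencv2/core/hal/intrin.hpp>
static cv::v_float32x4 transform_point(const cv::v_float32x4& p,   // (x, y, z, unused)
                                       const cv::v_float32x4& c0,  // matrix column 0
                                       const cv::v_float32x4& c1,  // matrix column 1
                                       const cv::v_float32x4& c2,  // matrix column 2
                                       const cv::v_float32x4& t)   // translation column
{
    return cv::v_matmuladd(p, c0, c1, c2, t);
}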
1578
1579#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
1580inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1581 const _Tpvec& a2, const _Tpvec& a3, \
1582 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1583{ \
1584 _Tpvec2 a02 = vec_mergeh(a0.val, a2.val); \
1585 _Tpvec2 a13 = vec_mergeh(a1.val, a3.val); \
1586 b0.val = vec_mergeh(a02, a13); \
1587 b1.val = vec_mergel(a02, a13); \
1588 a02 = vec_mergel(a0.val, a2.val); \
1589 a13 = vec_mergel(a1.val, a3.val); \
1590 b2.val = vec_mergeh(a02, a13); \
1591 b3.val = vec_mergel(a02, a13); \
1592}
1593OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
1594OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
1595OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
1596
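// Illustrative sketch (not part of this header): v_transpose4x4() turns four row
// registers into four column registers, the usual AoS -> SoA step for packets of
// four xyzw structures. Minimal example assuming opencv2/core/hal/intrin.hpp;
// load_xyzw_soa is a hypothetical helper name.
#include <opencv2/core/hal/intrin.hpp>
static void load_xyzw_soa(const float* aos,            // 16 floats: x0 y0 z0 w0  x1 y1 z1 w1 ...
                          cv::v_float32x4& x, cv::v_float32x4& y,
                          cv::v_float32x4& z, cv::v_float32x4& w)
{
    using namespace cv;
    v_float32x4 r0 = v_load(aos + 0), r1 = v_load(aos + 4);
    v_float32x4 r2 = v_load(aos + 8), r3 = v_load(aos + 12);
    v_transpose4x4(r0, r1, r2, r3, x, y, z, w);         // x = (x0..x3), y = (y0..y3), ...
}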
1597template<int i, typename Tvec>
1598inline Tvec v_broadcast_element(const Tvec& v)
1599{ return Tvec(vec_splat(v.val, i)); }
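// Illustrative sketch (not part of this header): v_broadcast_element<i>() splats a
// compile-time-selected lane across the whole register (vec_splat on VSX).
// splat_second_lane is a hypothetical helper name.
#include <opencv2/core/hal/intrin.hpp>
static cv::v_float32x4 splat_second_lane(const cv::v_float32x4& v)
{
    return cv::v_broadcast_element<1>(v); // (v1, v1, v1, v1)
}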
1600
1601
1602CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
1603
1605
1606}
1607
1608#endif // OPENCV_HAL_VSX_HPP