#ifndef OPENCV_HAL_VSX_HPP
#define OPENCV_HAL_VSX_HPP

#include "opencv2/core/utility.hpp"

#define CV_SIMD128_64F 1

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
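
////////// Types //////////

// Each v_<type>x<n> struct below wraps one 128-bit VSX register (vec_uchar16,
// vec_short8, vec_float4, ...) in its member `val` and exposes the scalar
// lane type, element-wise constructors, and get0() for reading lane 0.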
typedef uchar lane_type;

explicit v_uint8x16(const vec_uchar16& v) : val(v)
{}
v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v))
{}
v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
           uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
{}

uchar get0() const
{ return vec_extract(val, 0); }
typedef schar lane_type;

explicit v_int8x16(const vec_char16& v) : val(v)
{}
v_int8x16(vec_bchar16 v) : val(vec_char16_c(v))
{}
v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
          schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
{}

schar get0() const
{ return vec_extract(val, 0); }
typedef ushort lane_type;

explicit v_uint16x8(const vec_ushort8& v) : val(v)
{}
v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v))
{}
v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7))
{}

ushort get0() const
{ return vec_extract(val, 0); }
typedef short lane_type;

explicit v_int16x8(const vec_short8& v) : val(v)
{}
v_int16x8(vec_bshort8 v) : val(vec_short8_c(v))
{}
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7))
{}

short get0() const
{ return vec_extract(val, 0); }
typedef unsigned lane_type;

explicit v_uint32x4(const vec_uint4& v) : val(v)
{}
v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3))
{}

unsigned get0() const
{ return vec_extract(val, 0); }
typedef int lane_type;

explicit v_int32x4(const vec_int4& v) : val(v)
{}
v_int32x4(vec_bint4 v) : val(vec_int4_c(v))
{}
v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3))
{}

int get0() const
{ return vec_extract(val, 0); }
typedef float lane_type;

v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3))
{}

float get0() const
{ return vec_extract(val, 0); }
typedef uint64 lane_type;

explicit v_uint64x2(const vec_udword2& v) : val(v)
{}
v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v))
{}

uint64 get0() const
{ return vec_extract(val, 0); }
typedef int64 lane_type;

explicit v_int64x2(const vec_dword2& v) : val(v)
{}
v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v))
{}

int64 get0() const
{ return vec_extract(val, 0); }
typedef double lane_type;

explicit v_float64x2(const vec_double2& v) : val(v)
{}
v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1))
{}

double get0() const
{ return vec_extract(val, 0); }
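
// v_extract_n<i>() reads lane i of a vector as a scalar; it maps directly to
// a single vec_extract.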
#define OPENCV_HAL_IMPL_VSX_EXTRACT_N(_Tpvec, _Tp) \
template<int i> inline _Tp v_extract_n(VSX_UNUSED(_Tpvec v)) { return vec_extract(v.val, i); }

OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int16x8, short)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int32x4, int)
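
// Initialization and reinterpretation: v_setzero_*/v_setall_* splat one scalar
// across all lanes via vec_splats; v_reinterpret_as_* is a bit-pattern cast
// between register types. A minimal usage sketch:
//
//   v_int32x4 z = v_setzero_s32();            // {0, 0, 0, 0}
//   v_float32x4 f = v_setall_f32(2.5f);       // {2.5f, 2.5f, 2.5f, 2.5f}
//   v_int32x4 bits = v_reinterpret_as_s32(f); // raw IEEE-754 bits, no conversion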
#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast)                       \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); }            \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v)); }        \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
{ return _Tpvec((cast)a.val); }

OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
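
// Load/store: the `ld`/`st` arguments handle unaligned access, `ld_a`/`st_a`
// the aligned variants; vec_ld_l8/vec_st_l8 move only the low 8 bytes, which
// implements the half-register (*_low/*_high/halves) operations.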
#define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a)          \
inline _Tpvec v_load(const _Tp* ptr)                                              \
{ return _Tpvec(ld(0, ptr)); }                                                    \
inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr))                          \
{ return _Tpvec(ld_a(0, ptr)); }                                                  \
inline _Tpvec v_load_low(const _Tp* ptr)                                          \
{ return _Tpvec(vec_ld_l8(ptr)); }                                                \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)                     \
{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); }                \
inline void v_store(_Tp* ptr, const _Tpvec& a)                                    \
{ st(a.val, 0, ptr); }                                                            \
inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a)                \
{ st_a(a.val, 0, ptr); }                                                          \
inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a)        \
{ st_a(a.val, 0, ptr); }                                                          \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)               \
{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a)                                \
{ vec_st_l8(a.val, ptr); }                                                        \
inline void v_store_high(_Tp* ptr, const _Tpvec& a)                               \
{ vec_st_h8(a.val, ptr); }
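
// CV_COMPILER_VSX_BROKEN_ALIGNED works around compilers whose aligned
// vec_ld/vec_st builtins misbehave: when it is defined, the aligned entry
// points are routed through the unaligned vsx_ld/vsx_st as well.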
#ifdef CV_COMPILER_VSX_BROKEN_ALIGNED
    #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
    OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vsx_ld, vsx_st, vsx_st)
#else
    #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
    OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)
#endif
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8, short)
OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4, int)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld, vsx_ld, vsx_st, vsx_st)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2, uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2, int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
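
// Interleave/deinterleave for 2-, 3- and 4-channel data; the actual shuffling
// is delegated to the vec_ld_deinterleave/vec_st_interleave helpers.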
#define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec)                        \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b)      \
{ vec_ld_deinterleave(ptr, a.val, b.val); }                                \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a,                 \
                                _Tpvec& b, _Tpvec& c)                      \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val); }                         \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b,      \
                                _Tpvec& c, _Tpvec& d)                      \
{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); }                  \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
                               hal::StoreMode = hal::STORE_UNALIGNED)      \
{ vec_st_interleave(a.val, b.val, ptr); }                                  \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a,                  \
                               const _Tpvec& b, const _Tpvec& c,           \
                               hal::StoreMode = hal::STORE_UNALIGNED)      \
{ vec_st_interleave(a.val, b.val, c.val, ptr); }                           \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
                               const _Tpvec& c, const _Tpvec& d,           \
                               hal::StoreMode = hal::STORE_UNALIGNED)      \
{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
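
////////// Expand //////////

// Widen each lane to the next larger type: `fh` widens the lower-indexed half
// of the register (lanes 0..n/2-1), `fl` the upper half, so v_expand_low and
// v_expand_high each map to a single unpack. v_load_expand reads 8 bytes and
// widens them in one step.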
#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)  \
{                                                                \
    b0.val = fh(a.val);                                          \
    b1.val = fl(a.val);                                          \
}                                                                \
inline _Tpwvec v_expand_low(const _Tpvec& a)                     \
{ return _Tpwvec(fh(a.val)); }                                   \
inline _Tpwvec v_expand_high(const _Tpvec& a)                    \
{ return _Tpwvec(fl(a.val)); }                                   \
inline _Tpwvec v_load_expand(const _Tp* ptr)                     \
{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
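
// _LXSIWZX loads a single 32-bit word from memory into a vector register with
// zero extension (the lxsiwzx instruction); it feeds v_load_expand_q below.
// The fallback macro builds the same value without inline asm.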
#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
    #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
#else
    #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
#endif

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
    vec_uchar16 out;
    _LXSIWZX(out, ptr, vec_uchar16);
    out = vec_perm(out, out, pmu);
    return v_uint32x4((vec_uint4)out);
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    vec_char16 out;
    vec_short8 outs;
    vec_int4 outw;
    _LXSIWZX(out, ptr, vec_char16);
    outs = vec_unpackl(out);
    outw = vec_unpackh(outs);
    return v_int32x4(outw);
}
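
////////// Pack //////////

// v_pack narrows two wide vectors into one (saturating when pkfnc is
// vec_packs/vec_packsu); v_rshr_pack<n> first adds the rounding constant
// 1 << (n-1), shifts right by n (logically via vec_sr or arithmetically via
// vec_sra), then packs. The *_store forms write only the low 8 bytes.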
#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b)                                       \
{                                                                                                \
    return _Tpvec(pkfnc(a.val, b.val));                                                          \
}                                                                                                \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a)                                         \
{                                                                                                \
    vec_st_l8(pkfnc(a.val, a.val), ptr);                                                         \
}                                                                                                \
template<int n>                                                                                  \
inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b)                                  \
{                                                                                                \
    const __vector _Tpvn vn = vec_splats((_Tpvn)n);                                              \
    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1)));                      \
    return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn)));        \
}                                                                                                \
template<int n>                                                                                  \
inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a)                                    \
{                                                                                                \
    const __vector _Tpvn vn = vec_splats((_Tpvn)n);                                              \
    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1)));                      \
    vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr);                                \
}

OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short,
                         vec_sr, vec_packs, vec_adds, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short,
                         vec_sra, vec_packs, vec_adds, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int,
                         vec_sr, vec_packs, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int,
                         vec_sra, vec_packs, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long,
                         vec_sr, vec_pack, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long,
                         vec_sra, vec_pack, vec_add, pack)
OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short,
                         vec_sra, vec_packsu, vec_adds, pack_u)
OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
                         vec_sra, vec_packsu, vec_add, pack_u)
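
// v_pack_b packs boolean (all-ones/all-zeros) lanes down to bytes by applying
// vec_pack once per widening level: 16-bit masks need one step, 32-bit two,
// and 64-bit three.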
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    vec_uchar16 ab = vec_pack(a.val, b.val);
    return v_uint8x16(ab);
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    vec_ushort8 ab = vec_pack(a.val, b.val);
    vec_ushort8 cd = vec_pack(c.val, d.val);
    return v_uint8x16(vec_pack(ab, cd));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    vec_uint4 ab = vec_pack(a.val, b.val);
    vec_uint4 cd = vec_pack(c.val, d.val);
    vec_uint4 ef = vec_pack(e.val, f.val);
    vec_uint4 gh = vec_pack(g.val, h.val);

    vec_ushort8 abcd = vec_pack(ab, cd);
    vec_ushort8 efgh = vec_pack(ef, gh);
    return v_uint8x16(vec_pack(abcd, efgh));
}
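
////////// Recombine //////////

// v_zip interleaves two vectors lane by lane (vec_mergeh/vec_mergel);
// v_combine_low/v_combine_high build a vector from the low or high halves of
// two registers, and v_recombine produces both at once.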
template<typename _Tpvec>
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
{
    b0.val = vec_mergeh(a0.val, a1.val);
    b1.val = vec_mergel(a0.val, a1.val);
}

template<typename _Tpvec>
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(vec_mergesql(a.val, b.val)); }

template<typename _Tpvec>
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
{ return _Tpvec(vec_mergesqh(a.val, b.val)); }

template<typename _Tpvec>
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
    c.val = vec_mergesqh(a.val, b.val);
    d.val = vec_mergesql(a.val, b.val);
}
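
////////// Arithmetic, bitwise and comparison operations //////////

// The macro emits operator op / operator op= pairs; 8/16-bit integer +/- use
// the saturating vec_adds/vec_subs, while 32/64-bit and floating-point types
// use the wrapping vec_add/vec_sub. A minimal usage sketch:
//
//   v_float32x4 a = v_setall_f32(1.f), b = v_setall_f32(2.f);
//   a += b;   // a is now {3.f, 3.f, 3.f, 3.f}, via vec_add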
#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin)       \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }                         \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)   \
{ a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
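
// Saturating 8/16-bit multiplication has no single VSX instruction, so
// operator* widens with v_mul_expand and saturates on the way back down
// through v_pack.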
#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec)        \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{                                                           \
    _Tpwvec c, d;                                           \
    v_mul_expand(a, b, c, d);                               \
    return v_pack(c, d);                                    \
}                                                           \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)     \
{ a = a * b; return a; }
template<typename Tvec, typename Twvec>
inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
{
    Twvec p0 = Twvec(vec_mule(a.val, b.val));
    Twvec p1 = Twvec(vec_mulo(a.val, b.val));
    v_zip(p0, p1, c, d);
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    vec_int4 p0 = vec_mule(a.val, b.val);
    vec_int4 p1 = vec_mulo(a.val, b.val);
    static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
    return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    vec_uint4 p0 = vec_mule(a.val, b.val);
    vec_uint4 p1 = vec_mulo(a.val, b.val);
    static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
    return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm));
}
#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin)   \
template<typename _Tpvec>                            \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
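
////////// Bitwise shifts //////////

// The shift amount is splatted across an unsigned vector; unsigned types shift
// right logically (vec_sr) while signed types shift arithmetically (vec_sra).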
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec operator << (const _Tpvec& a, int imm)       \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); }            \
inline _Tpvec operator >> (const _Tpvec& a, int imm)       \
{ return _Tpvec(shr(a.val, splfunc(imm))); }               \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a)     \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); }            \
template<int imm> inline _Tpvec v_shr(const _Tpvec& a)     \
{ return _Tpvec(shr(a.val, splfunc(imm))); }

OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)

OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec)   \
OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or)  \
OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
inline _Tpvec operator ~ (const _Tpvec& a)     \
{ return _Tpvec(vec_not(a.val)); }
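
////////// Select //////////

// v_select is a bitwise blend: for each bit set in `mask` the result takes the
// bit from `a`, otherwise from `b` (vec_sel with the mask cast to the matching
// boolean vector type).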
#define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast)                             \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }

OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec)               \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpeq(a.val, b.val)); }                  \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpne(a.val, b.val)); }                  \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b)  \
{ return _Tpvec(vec_cmplt(a.val, b.val)); }                  \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b)  \
{ return _Tpvec(vec_cmpgt(a.val, b.val)); }                  \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmple(a.val, b.val)); }                  \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vec_cmpge(a.val, b.val)); }

OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
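
////////// Rotate //////////

// Single-vector rotates shift the whole 128-bit register by whole lanes: the
// lane count `imm` is converted to bytes (wd) and then to bits (wd << 3) for
// vec_slo/vec_sro; shifting by 16 bytes or more yields zero.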
#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast)                    \
template<int imm>                                                            \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a)                             \
{                                                                            \
    const int wd = imm * sizeof(typename _Tpvec::lane_type);                 \
    if (wd > 15)                                                             \
        return _Tpvec::zero();                                               \
    return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
}

#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast)     \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return a;
#ifdef __IBMCPP__
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT & 15));
#else
    return _Tpvec(vec_sld(b.val, a.val, CV_SHIFT));
#endif
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
{
    enum { CV_SHIFT = imm * (sizeof(typename _Tpvec::lane_type)) };
    if (CV_SHIFT == 16)
        return b;
    return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
}
#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2)   \
template<int imm>                                                 \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
{                                                                 \
    if (imm == 1)                                                 \
        return _Tpvec(vec_permi(rg1.val, rg2.val, 2));            \
    return imm ? b : a;                                           \
}

#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec)  \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left, b, a) \
OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)

OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
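
////////// Reverse //////////

// Lane reversal is a byte permutation: each perm table below lists the source
// byte indices that put the lanes in reverse order while keeping the bytes
// within each lane in their original order.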
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
    static const vec_uchar16 perm = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
    vec_uchar16 vec = (vec_uchar16)a.val;
    return v_uint8x16(vec_perm(vec, vec, perm));
}

inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
    static const vec_uchar16 perm = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
    vec_uchar16 vec = (vec_uchar16)a.val;
    return v_reinterpret_as_u16(v_uint8x16(vec_perm(vec, vec, perm)));
}

inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
    static const vec_uchar16 perm = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
    vec_uchar16 vec = (vec_uchar16)a.val;
    return v_reinterpret_as_u32(v_uint8x16(vec_perm(vec, vec, perm)));
}

inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
    static const vec_uchar16 perm = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
    vec_uchar16 vec = (vec_uchar16)a.val;
    return v_reinterpret_as_u64(v_uint8x16(vec_perm(vec, vec, perm)));
}

inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{ return v_rotate_right<s>(a, b); }
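
////////// Reduce and mask //////////

// v_reduce_sum folds all lanes into one scalar. The 8/16-bit versions use
// vec_sum4s/vec_sums, which accumulate partial sums into 32-bit lanes; the
// 4-lane reductions instead halve the problem twice with vec_sld byte shifts.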
inline uint v_reduce_sum(const v_uint8x16& a)
{
    const vec_uint4 zero4 = vec_uint4_z;
    vec_uint4 sum4 = vec_sum4s(a.val, zero4);
    return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
}
inline int v_reduce_sum(const v_int8x16& a)
{
    const vec_int4 zero4 = vec_int4_z;
    vec_int4 sum4 = vec_sum4s(a.val, zero4);
    return (int)vec_extract(vec_sums(sum4, zero4), 3);
}
inline int v_reduce_sum(const v_int16x8& a)
{
    const vec_int4 zero = vec_int4_z;
    return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
inline uint v_reduce_sum(const v_uint16x8& a)
{
    const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
    return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a)                               \
{                                                                                  \
    const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));                      \
    return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0);                           \
}

OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0); }

inline int64 v_reduce_sum(const v_int64x2& a)
{ return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0); }

inline double v_reduce_sum(const v_float64x2& a)
{ return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0); }
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a)                               \
{                                                                                  \
    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));                            \
    rs = func(rs, vec_sld(rs, rs, 4));                                             \
    return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0);                           \
}

OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a)                                \
{                                                                                   \
    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));                             \
    rs = func(rs, vec_sld(rs, rs, 4));                                              \
    rs = func(rs, vec_sld(rs, rs, 2));                                              \
    return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0);                            \
}
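
// v_reduce_sum4 transposes-and-adds four float vectors so each output lane
// holds the total of one input vector; v_reduce_sad computes the sum of
// absolute differences across all lanes.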
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
    ac = vec_add(ac, vec_sld(ac, ac, 8));

    vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
    bd = vec_add(bd, vec_sld(bd, bd, 8));
    return v_float32x4(vec_mergeh(ac, bd));
}
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    const vec_uint4 zero4 = vec_uint4_z;
    vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
    return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    const vec_int4 zero4 = vec_int4_z;
    vec_char16 ad = vec_abss(vec_subs(a.val, b.val));
    vec_int4 sum4 = vec_sum4s(ad, zero4);
    return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    vec_ushort8 ad = vec_absd(a.val, b.val);
    VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)) + vec_int4_c(vec_unpacklu(ad)), vec_int4_z);
    return (unsigned)vec_extract(sum, 3);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    const vec_int4 zero4 = vec_int4_z;
    vec_short8 ad = vec_abss(vec_subs(a.val, b.val));
    vec_int4 sum4 = vec_sum4s(ad, zero4);
    return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    const vec_uint4 ad = vec_absd(a.val, b.val);
    const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
    return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    vec_int4 ad = vec_abss(vec_sub(a.val, b.val));
    return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3);
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
    const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
    return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
}
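
////////// Mask //////////

// v_signmask packs the sign bit of every lane into an integer bitmask.
// vec_vbpermq gathers arbitrary bits selected by qperm (each entry is a bit
// index; index 128 selects a constant zero), so one instruction collects all
// the most-significant bits at once.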
inline int v_signmask(const v_uint8x16& a)
{
    static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }

inline int v_signmask(const v_int16x8& a)
{
    static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
}
inline int v_signmask(const v_uint16x8& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_int32x4& a)
{
    static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
}
inline int v_signmask(const v_uint32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }

inline int v_signmask(const v_int64x2& a)
{
    VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
    return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
}
inline int v_signmask(const v_uint64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
template<typename _Tpvec>
inline bool v_check_all(const _Tpvec& a)
{ return vec_all_lt(a.val, _Tpvec::zero().val); }

template<typename _Tpvec>
inline bool v_check_any(const _Tpvec& a)
{ return vec_any_lt(a.val, _Tpvec::zero().val); }
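
// Fused multiply-add family: v_fma/v_muladd compute a*b + c in one vec_madd;
// v_magnitude and v_sqr_magnitude build sqrt(a*a + b*b) and a*a + b*b from
// the same primitive.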
#define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec)                                  \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)                 \
{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)             \
{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); }           \
inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)      \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }                           \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)   \
{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return a * b + c; }
inline v_uint8x16 v_abs(const v_int8x16& x)
{ return v_uint8x16(vec_uchar16_c(vec_abs(x.val))); }

inline v_uint16x8 v_abs(const v_int16x8& x)
{ return v_uint16x8(vec_ushort8_c(vec_abs(x.val))); }

inline v_uint32x4 v_abs(const v_int32x4& x)
{ return v_uint32x4(vec_uint4_c(vec_abs(x.val))); }
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)

inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); }

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(a - b); }
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(a - b); }

inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); }
inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); }
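
////////// Rounding and conversions //////////

// vec_rint rounds to nearest-even before vec_cts truncates to int, giving
// round-to-nearest overall; v_floor/v_ceil substitute vec_floor/vec_ceil.
// The *o intrinsics (vec_ctso, vec_cvfo, vec_ctdo) operate on odd-indexed
// elements, which is why the double<->float/int conversions go through
// vec_mergesqo/vec_mergeh/vec_mergel.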
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_rint(a.val))); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); }

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_floor(a.val))); }

inline v_int32x4 v_floor(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_floor(a.val)), vec_int4_z)); }

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_ceil(a.val))); }

inline v_int32x4 v_ceil(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(vec_ceil(a.val)), vec_int4_z)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(vec_mergesqo(vec_ctso(a.val), vec_int4_z)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
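
////////// Lookup table access //////////

// VSX has no gather instruction, so the v_lut family assembles vectors from
// scalar loads; *_pairs and *_quads read 2 or 4 consecutive elements per index
// by loading them as one wider scalar.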
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    return v_reinterpret_as_s8(v_int16x8(
        *(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]),
        *(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7])));
}

inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    return v_reinterpret_as_s8(v_int32x4(
        *(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3])));
}

inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    return v_reinterpret_as_s16(v_int32x4(
        *(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
}

inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
{ return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    const int idx[4] = {
        vec_extract(idxvec.val, 0),
        vec_extract(idxvec.val, 1),
        vec_extract(idxvec.val, 2),
        vec_extract(idxvec.val, 3)
    };
    return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    const int idx[4] = {
        vec_extract(idxvec.val, 0),
        vec_extract(idxvec.val, 1),
        vec_extract(idxvec.val, 2),
        vec_extract(idxvec.val, 3)
    };
    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    const int idx[4] = {
        vec_extract(idxvec.val, 0),
        vec_extract(idxvec.val, 1),
        vec_extract(idxvec.val, 2),
        vec_extract(idxvec.val, 3)
    };
    return v_uint32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    const int idx[2] = {
        vec_extract(idxvec.val, 0),
        vec_extract(idxvec.val, 1)
    };
    return v_float64x2(tab[idx[0]], tab[idx[1]]);
}
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    vec_float4 xy0 = vec_ld_l8(tab + vec_extract(idxvec.val, 0));
    vec_float4 xy1 = vec_ld_l8(tab + vec_extract(idxvec.val, 1));
    vec_float4 xy2 = vec_ld_l8(tab + vec_extract(idxvec.val, 2));
    vec_float4 xy3 = vec_ld_l8(tab + vec_extract(idxvec.val, 3));
    vec_float4 xy02 = vec_mergeh(xy0, xy2);
    vec_float4 xy13 = vec_mergeh(xy1, xy3);
    x.val = vec_mergeh(xy02, xy13);
    y.val = vec_mergel(xy02, xy13);
}
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    vec_double2 xy0 = vsx_ld(vec_extract(idxvec.val, 0), tab);
    vec_double2 xy1 = vsx_ld(vec_extract(idxvec.val, 1), tab);
    x.val = vec_mergeh(xy0, xy1);
    y.val = vec_mergel(xy0, xy1);
}
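
// v_interleave_pairs/v_interleave_quads reorder lanes within the register and
// v_pack_triplets compresses 4-element groups to 3; all of them are single
// vec_perm shuffles with precomputed byte patterns.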
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    static const vec_uchar16 perm = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15};
    return v_int8x16(vec_perm(vec.val, vec.val, perm));
}

inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    static const vec_uchar16 perm = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
    return v_int8x16(vec_perm(vec.val, vec.val, perm));
}

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    static const vec_uchar16 perm = {0,1, 4,5, 2,3, 6,7, 8,9, 12,13, 10,11, 14,15};
    return v_int16x8(vec_perm(vec.val, vec.val, perm));
}

inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    static const vec_uchar16 perm = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
    return v_int16x8(vec_perm(vec.val, vec.val, perm));
}

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    static const vec_uchar16 perm = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15};
    return v_int32x4(vec_perm(vec.val, vec.val, perm));
}

inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    static const vec_uchar16 perm = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 15, 15, 15};
    return v_int8x16(vec_perm(vec.val, vec.val, perm));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    static const vec_uchar16 perm = {0,1, 2,3, 4,5, 8,9, 10,11, 12,13, 14,15, 14,15};
    return v_int16x8(vec_perm(vec.val, vec.val, perm));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
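
////////// FP16 support //////////

// Half-precision load: on VSX3 the hardware conversion (xvcvhpsp or the
// vec_extract_fp_from_shorth builtin) is used; otherwise the bits are widened
// manually by shifting sign/exponent/mantissa into float positions and
// rebiasing the exponent (the 0x38000000 delta), with a float-arithmetic
// trick covering subnormals and a separate fixup for Inf/NaN.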
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
    vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
#if CV_VSX3 && defined(vec_extract_fp_from_shorth)
    return v_float32x4(vec_extract_fp_from_shorth(vf16));
#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
    vec_float4 vf32;
    __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wa" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
    return v_float32x4(vf32);
#else
    const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
    const vec_int4 signmask = vec_int4_sp(0x80000000);
    const vec_int4 maxexp = vec_int4_sp(0x7c000000);
    const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));

    vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16)));
    vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
    vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta);
    vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf));

    t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e)));
    vec_bint4 zmask = vec_cmpeq(e, z);
    vec_int4 ft = vec_sel(t, zt, zmask);
    return v_float32x4(vec_float4_c(vec_or(ft, sign)));
#endif
}
inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
    vec_ushort8 vf16;
    __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (v.val));
    vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
#else
    const vec_int4 signmask = vec_int4_sp(0x80000000);
    const vec_int4 rval = vec_int4_sp(0x3f000000);

    vec_int4 t = vec_int4_c(v.val);
    vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16));
    t = vec_and(vec_nor(signmask, signmask), t);

    vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t);
    vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000));
    vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan);
    vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t);
    vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
    tt = vec_sub(tt, rval);
    vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1));
    vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
    nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
    t = vec_sel(nt, tt, tinymask);
    t = vec_sel(naninf, t, finitemask);
    t = vec_or(t, sign);
    vec_st_l8(vec_packs(t, t), ptr);
#endif
}
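
////////// Dot products //////////

// vec_msum multiplies adjacent lane pairs and accumulates them into the wider
// type in a single instruction, so most 16->32 and unsigned 8->32 dot products
// are one vec_msum; the remaining variants are built from vec_mule/vec_mulo
// (even/odd lane products) plus merges and adds.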
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    vec_dword2 even = vec_mule(a.val, b.val);
    vec_dword2 odd = vec_mulo(a.val, b.val);
    return v_int64x2(vec_add(even, odd));
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_uint32x4(vec_msum(a.val, b.val, c.val)); }
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); }
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    const vec_ushort8 eight = vec_ushort8_sp(8);
    vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight);
    vec_short8 a1 = vec_sra((vec_short8)a.val, eight);
    vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
    vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
    return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    const vec_ushort8 eight = vec_ushort8_sp(8);
    vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight);
    vec_short8 a1 = vec_sra((vec_short8)a.val, eight);
    vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
    vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
    return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val)));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    const vec_uint4 zero = vec_uint4_z;
    vec_uint4 even = vec_mule(a.val, b.val);
    vec_uint4 odd = vec_mulo(a.val, b.val);
    vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero);
    vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero);
    vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero);
    vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero);
    vec_udword2 s0 = vec_add(e0, o0);
    vec_udword2 s1 = vec_add(e1, o1);
    return v_uint64x2(vec_add(s0, s1));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    v_int32x4 prod = v_dotprod(a, b);
    v_int64x2 c, d;  // 64-bit expansion of the 32-bit dot products (reconstructed context)
    v_expand(prod, c, d);
    return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }

inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
    vec_short8 a0 = vec_unpackh(a.val);
    vec_short8 a1 = vec_unpackl(a.val);
    vec_short8 b0 = vec_unpackh(b.val);
    vec_short8 b1 = vec_unpackl(b.val);
    return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
}
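
////////// Matrix operations //////////

// v_matmul treats m0..m3 as matrix rows: each element of v is broadcast with
// vec_splat and multiply-accumulated down the rows, i.e.
// result = v0*m0 + v1*m1 + v2*m2 + v3*m3. v_matmuladd replaces the last row
// product with a plain additive vector a.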
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
}
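
// 4x4 transpose via two rounds of merges: vec_mergeh/vec_mergel first
// interleave rows 0/2 and 1/3, then a second pass yields the transposed rows.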
#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2)                  \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,             \
                           const _Tpvec& a2, const _Tpvec& a3,             \
                           _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
{                                                                          \
    _Tpvec2 a02 = vec_mergeh(a0.val, a2.val);                              \
    _Tpvec2 a13 = vec_mergeh(a1.val, a3.val);                              \
    b0.val = vec_mergeh(a02, a13);                                         \
    b1.val = vec_mergel(a02, a13);                                         \
    a02 = vec_mergel(a0.val, a2.val);                                      \
    a13 = vec_mergel(a1.val, a3.val);                                      \
    b2.val = vec_mergeh(a02, a13);                                         \
    b3.val = vec_mergel(a02, a13);                                         \
}

OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
template<int i, typename Tvec>
inline Tvec v_broadcast_element(const Tvec& v)
{ return Tvec(vec_splat(v.val, i)); }
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

#endif // OPENCV_HAL_VSX_HPP