#ifndef OPENCV_HAL_SSE_HPP
#define OPENCV_HAL_SSE_HPP

#include "opencv2/core/utility.hpp"

#define CV_SIMD128_64F 1
#define CV_SIMD128_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
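// 128-bit register wrappers. Each v_<type>x<n> struct below holds one SSE register
// (__m128i for integers, __m128 for float, __m128d for double) and exposes its
// lane_type, the raw vector_type, an element count and a get0() accessor that
// extracts lane 0.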
typedef uchar lane_type;
typedef __m128i vector_type;

val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                    (char)v4, (char)v5, (char)v6, (char)v7,
                    (char)v8, (char)v9, (char)v10, (char)v11,
                    (char)v12, (char)v13, (char)v14, (char)v15);

return (uchar)_mm_cvtsi128_si32(val);
typedef schar lane_type;
typedef __m128i vector_type;
enum { nlanes = 16 };

explicit v_int8x16(__m128i v) : val(v) {}

val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                    (char)v4, (char)v5, (char)v6, (char)v7,
                    (char)v8, (char)v9, (char)v10, (char)v11,
                    (char)v12, (char)v13, (char)v14, (char)v15);

return (schar)_mm_cvtsi128_si32(val);
typedef __m128i vector_type;

val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                     (short)v4, (short)v5, (short)v6, (short)v7);

return (ushort)_mm_cvtsi128_si32(val);
typedef short lane_type;
typedef __m128i vector_type;

explicit v_int16x8(__m128i v) : val(v) {}
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
    val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                         (short)v4, (short)v5, (short)v6, (short)v7);
}

return (short)_mm_cvtsi128_si32(val);
typedef unsigned lane_type;
typedef __m128i vector_type;

v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
{
    val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
}

unsigned get0() const
{
    return (unsigned)_mm_cvtsi128_si32(val);
}
typedef int lane_type;
typedef __m128i vector_type;

explicit v_int32x4(__m128i v) : val(v) {}
v_int32x4(int v0, int v1, int v2, int v3)
{
    val = _mm_setr_epi32(v0, v1, v2, v3);
}

return _mm_cvtsi128_si32(val);
typedef float lane_type;
typedef __m128 vector_type;

v_float32x4(float v0, float v1, float v2, float v3)
{
    val = _mm_setr_ps(v0, v1, v2, v3);
}

return _mm_cvtss_f32(val);
typedef __m128i vector_type;

#if defined(_MSC_VER) && _MSC_VER >= 1920 && defined(_M_X64) && !defined(__clang__)
    val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
#elif defined(__GNUC__)
    val = _mm_setr_epi64((__m64)v0, (__m64)v1);
#else
    val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
#endif

#if !defined(__x86_64__) && !defined(_M_X64)
    int a = _mm_cvtsi128_si32(val);
    int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
    return (unsigned)a | ((uint64)(unsigned)b << 32);
#else
    return (uint64)_mm_cvtsi128_si64(val);
#endif
typedef int64 lane_type;
typedef __m128i vector_type;

explicit v_int64x2(__m128i v) : val(v) {}

#if defined(_MSC_VER) && _MSC_VER >= 1920 && defined(_M_X64) && !defined(__clang__)
    val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
#elif defined(__GNUC__)
    val = _mm_setr_epi64((__m64)v0, (__m64)v1);
#else
    val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
#endif

#if !defined(__x86_64__) && !defined(_M_X64)
    int a = _mm_cvtsi128_si32(val);
    int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
    return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
#else
    return _mm_cvtsi128_si64(val);
#endif
typedef double lane_type;
typedef __m128d vector_type;

val = _mm_setr_pd(v0, v1);

return _mm_cvtsd_f64(val);
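// Internal helpers: v_sse_reinterpret_as<> performs bit-exact casts between the
// three SSE register types using the _mm_cast* intrinsics (a no-op when the
// source and destination register types already match).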
namespace hal_sse_internal

template <typename to_sse_type, typename from_sse_type>
to_sse_type v_sse_reinterpret_as(const from_sse_type& val);

#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
    to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
    { return sse_cast_intrin(a); }

OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }

OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
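// The macro above generates the v_setzero_<suffix>(), v_setall_<suffix>(v) and
// v_reinterpret_as_<suffix>() helpers for each vector type. Illustrative use
// (not taken from this header):
//   v_float32x4 ones = v_setall_f32(1.0f);
//   v_int32x4 bits = v_reinterpret_as_s32(ones);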
template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
template<typename _Tpvec> inline
v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
{ return _Tpvec(_mm_castps_si128(a.val)); } \
inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
{ return _Tpvec(_mm_castpd_si128(a.val)); }

OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
__m128i delta = _mm_set1_epi16(255);
return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
                                   _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));

__m128i delta = _mm_set1_epi16(255);
__m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
_mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));

{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }

{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
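// The v_rshr_pack / v_rshr_pack_store variants below narrow with rounding:
// each lane is computed as (x + (1 << (n-1))) >> n before the saturating pack,
// which is what the added 'delta' constant implements.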
template<int n> inline

__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
                                   _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));

template<int n> inline

__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
__m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));

template<int n> inline

__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                   _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));

template<int n> inline

__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
__m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));

{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }

{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
template<int n> inline

__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));

template<int n> inline

__m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
__m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
{
#if CV_SSE4_1
    return _mm_blendv_epi8(b, a, mask);
#else
    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
#endif
}

{ return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }

__m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
__m128i r = _mm_packs_epi32(a1, a1);
_mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
template<int n> inline

__m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
__m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));

template<int n> inline

__m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
__m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
_mm_storel_epi64((__m128i*)ptr, a2);
return v_uint16x8(_mm_packus_epi32(a.val, b.val));

__m128i delta32 = _mm_set1_epi32(32768);

__m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
__m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));

__m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));

_mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));

__m128i delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(a.val, delta32);
__m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
_mm_storel_epi64((__m128i*)ptr, r);
template<int n> inline

__m128i delta = _mm_set1_epi32(1 << (n - 1));
return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                   _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));

__m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
__m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
__m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
__m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
return v_uint16x8(_mm_unpacklo_epi64(a2, b2));

template<int n> inline

__m128i delta = _mm_set1_epi32(1 << (n - 1));
__m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
_mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1));

__m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
__m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
__m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
_mm_storel_epi64((__m128i*)ptr, a2);
{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }

_mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));

template<int n> inline

__m128i delta = _mm_set1_epi32(1 << (n-1));
return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
}
__m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
__m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
return v_uint32x4(_mm_unpacklo_epi32(v0, v1));

__m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
_mm_storel_epi64((__m128i*)ptr, a1);

__m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
__m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
return v_int32x4(_mm_unpacklo_epi32(v0, v1));

__m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
_mm_storel_epi64((__m128i*)ptr, a1);

template<int n> inline

__m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
__m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
__m128i v0 = _mm_unpacklo_epi32(a1, b1);
__m128i v1 = _mm_unpackhi_epi32(a1, b1);
return v_uint32x4(_mm_unpacklo_epi32(v0, v1));

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)

__m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
__m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
_mm_storel_epi64((__m128i*)ptr, a2);
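// SSE2 has no 64-bit arithmetic shift. v_sign_epi64 builds a full-lane sign mask
// and v_srai_epi64 emulates the arithmetic shift as ((x ^ sign) >> imm) ^ sign
// on top of the logical 64-bit shift.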
inline __m128i v_sign_epi64(__m128i a)
{
    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1));
}

inline __m128i v_srai_epi64(__m128i a, int imm)
{
    __m128i smask = v_sign_epi64(a);
    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
}

template<int n> inline

__m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
__m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
__m128i v0 = _mm_unpacklo_epi32(a1, b1);
__m128i v1 = _mm_unpackhi_epi32(a1, b1);
return v_int32x4(_mm_unpacklo_epi32(v0, v1));

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)

__m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
__m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
_mm_storel_epi64((__m128i*)ptr, a2);
__m128i ab = _mm_packs_epi16(a.val, b.val);

__m128i ab = _mm_packs_epi32(a.val, b.val);
__m128i cd = _mm_packs_epi32(c.val, d.val);

__m128i ab = _mm_packs_epi32(a.val, b.val);
__m128i cd = _mm_packs_epi32(c.val, d.val);
__m128i ef = _mm_packs_epi32(e.val, f.val);
__m128i gh = _mm_packs_epi32(g.val, h.val);

__m128i abcd = _mm_packs_epi32(ab, cd);
__m128i efgh = _mm_packs_epi32(ef, gh);
return v_uint8x16(_mm_packs_epi16(abcd, efgh));
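// v_matmul / v_matmuladd: broadcast each element of v with _mm_shuffle_ps,
// multiply it by the corresponding matrix row and sum the four partial products
// (or three products plus the additive vector in the muladd form).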
__m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
__m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
__m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
__m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);

return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));

__m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
__m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
__m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);

return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
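// OPENCV_HAL_IMPL_SSE_BIN_OP generates operator bin_op and its compound
// bin_op= form from a single intrinsic. Note that the 8/16-bit +/- operators are
// mapped to the saturating _mm_adds_*/_mm_subs_* intrinsics. Illustrative use
// (not part of this header):
//   v_uint8x16 s = a + b;   // per-lane saturating add
//   s += a;                 // in-place form from the second operator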
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

c = v_mul_wrap(a0, b0);
d = v_mul_wrap(a1, b1);

c = v_mul_wrap(a0, b0);
d = v_mul_wrap(a1, b1);
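// v_mul_expand for 16-bit lanes: _mm_mullo_epi16 and _mm_mulhi_epi16/_mm_mulhi_epu16
// give the low and high halves of each 32-bit product; interleaving them
// reassembles the full-width results into c (low lanes) and d (high lanes).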
__m128i v0 = _mm_mullo_epi16(a.val, b.val);
__m128i v1 = _mm_mulhi_epi16(a.val, b.val);
c.val = _mm_unpacklo_epi16(v0, v1);
d.val = _mm_unpackhi_epi16(v0, v1);

__m128i v0 = _mm_mullo_epi16(a.val, b.val);
__m128i v1 = _mm_mulhi_epu16(a.val, b.val);
c.val = _mm_unpacklo_epi16(v0, v1);
d.val = _mm_unpackhi_epi16(v0, v1);

__m128i c0 = _mm_mul_epu32(a.val, b.val);
__m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
c.val = _mm_unpacklo_epi64(c0, c1);
d.val = _mm_unpackhi_epi64(c0, c1);
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }

__m128i even = _mm_mul_epi32(a.val, b.val);
__m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
return v_int64x2(_mm_add_epi64(even, odd));

__m128i even_u = _mm_mul_epu32(a.val, b.val);
__m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));

__m128i a_sign = _mm_srai_epi32(a.val, 31);
__m128i b_sign = _mm_srai_epi32(b.val, 31);

__m128i axb = _mm_and_si128(a.val, b_sign);
__m128i bxa = _mm_and_si128(b.val, a_sign);

__m128i ssum = _mm_add_epi32(bxa, axb);
__m128i even_ssum = _mm_slli_epi64(ssum, 32);
__m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));

return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
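// Byte dot products: the even and odd 8-bit lanes are widened to 16 bit
// (zero-extended with srli for unsigned input, sign-extended with srai for
// signed input) and combined with _mm_madd_epi16, which multiplies and adds
// adjacent 16-bit pairs.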
__m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8);
__m128i a1 = _mm_srli_epi16(a.val, 8);
__m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
__m128i b1 = _mm_srli_epi16(b.val, 8);
__m128i p0 = _mm_madd_epi16(a0, b0);
__m128i p1 = _mm_madd_epi16(a1, b1);

__m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8);
__m128i a1 = _mm_srai_epi16(a.val, 8);
__m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
__m128i b1 = _mm_srai_epi16(b.val, 8);
__m128i p0 = _mm_madd_epi16(a0, b0);
__m128i p1 = _mm_madd_epi16(a1, b1);
    _mm_unpacklo_epi64(c0.val, d0.val),
    _mm_unpackhi_epi64(c0.val, d0.val)

    _mm_unpacklo_epi64(c.val, d.val),
    _mm_unpackhi_epi64(c.val, d.val)

    _mm_unpacklo_pd(c.val, d.val),
    _mm_unpackhi_pd(c.val, d.val)

__m128i p0 = _mm_madd_epi16(a0, b0);
__m128i p1 = _mm_madd_epi16(a1, b1);

__m128i a0 = _mm_cvtepi8_epi16(a.val);
__m128i b0 = _mm_cvtepi8_epi16(b.val);
__m128i p0 = _mm_madd_epi16(a0, b0);
__m128i p1 = _mm_madd_epi16(a1, b1);
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
}

OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
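// v_invsqrt(float): start from the _mm_rsqrt_ps estimate and refine it with one
// Newton-Raphson step, t = t * (1.5 - 0.5 * x * t * t); the double-precision
// version simply computes 1.0 / _mm_sqrt_pd(x).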
const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);

__m128 h = _mm_mul_ps(t, _0_5);
t = _mm_rsqrt_ps(t);
t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));

const __m128d v_1 = _mm_set1_pd(1.);
return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }

__m128i s = _mm_srli_epi32(x.val, 31);
__m128i f = _mm_srai_epi32(x.val, 31);
return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));

{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }

return v_float64x2(_mm_and_pd(x.val,
    _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
return v_int8x16(_mm_min_epi8(a.val, b.val));

__m128i delta = _mm_set1_epi8((char)-128);
return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
                                                   _mm_xor_si128(b.val, delta))));

return v_int8x16(_mm_max_epi8(a.val, b.val));

__m128i delta = _mm_set1_epi8((char)-128);
return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
                                                   _mm_xor_si128(b.val, delta))));

return v_uint16x8(_mm_min_epu16(a.val, b.val));

return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));

return v_uint16x8(_mm_max_epu16(a.val, b.val));

return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));

return v_uint32x4(_mm_min_epu32(a.val, b.val));

__m128i delta = _mm_set1_epi32((int)0x80000000);
__m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));

return v_uint32x4(_mm_max_epu32(a.val, b.val));

__m128i delta = _mm_set1_epi32((int)0x80000000);
__m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));

return v_int32x4(_mm_min_epi32(a.val, b.val));

return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));

return v_int32x4(_mm_max_epi32(a.val, b.val));

return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
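// Integer comparisons: SSE only provides signed _mm_cmpgt_*; the unsigned
// variants below flip the sign bit of both operands (xor with sbit) so that the
// signed comparison yields the unsigned ordering, and !=, <=, >= are derived by
// inverting the corresponding mask.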
#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
}
#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
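// 64-bit equality: with SSE4.1 _mm_cmpeq_epi64 is used directly; otherwise it is
// emulated by comparing 32-bit halves and AND-ing each half with its swapped
// neighbour, so a lane is all ones only when both halves match.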
#if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#else
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
  return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#endif

OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }

{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
__m128i ad = _mm_srai_epi16(a.val, 8);
__m128i bd = _mm_srai_epi16(b.val, 8);
__m128i p0 = _mm_mullo_epi16(a.val, b.val);
__m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8);
const __m128i b01 = _mm_set1_epi32(0xFF00FF00);
return v_uint8x16(_v128_blendv_epi8(p0, p1, b01));

return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
{ return v_add_wrap(a - b, b - a); }

{ return v_add_wrap(a - b, b - a); }

{ return v_max(a, b) - v_min(a, b); }

return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));

return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));

return v_reinterpret_as_u32((d ^ m) - m);

{ return v_max(a, b) - v_min(a, b); }
return v_fma(a, b, c);

return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));

return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));

return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));

return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpvec res = v_fma(a, a, b*b); \
    return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return v_fma(a, b, c); \
}

OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
}
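// v_sse_palignr_u8<imm> concatenates two registers and extracts 16 bytes
// starting at byte offset imm. The class specializations below pick the cheapest
// form: pass-through for imm 0 or 16, a 64-bit unpack for imm 8, _mm_alignr_epi8
// where SSSE3 is available, and a shift/OR fallback otherwise.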
namespace hal_sse_internal

template <int imm,
    bool is_invalid = ((imm < 0) || (imm > 16)),
    bool is_first = (imm == 0),
    bool is_half = (imm == 8),
    bool is_second = (imm == 16),
    bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
class v_sse_palignr_u8_class;

class v_sse_palignr_u8_class<imm, true, false, false, false, false>;

class v_sse_palignr_u8_class<imm, false, true, false, false, false>

inline __m128i operator()(const __m128i& a, const __m128i&) const

class v_sse_palignr_u8_class<imm, false, false, true, false, false>

inline __m128i operator()(const __m128i& a, const __m128i& b) const
{
    return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
}

class v_sse_palignr_u8_class<imm, false, false, false, true, false>

inline __m128i operator()(const __m128i&, const __m128i& b) const

class v_sse_palignr_u8_class<imm, false, false, false, false, true>

inline __m128i operator()(const __m128i& a, const __m128i& b) const
{
    return _mm_alignr_epi8(b, a, imm);
}

inline __m128i operator()(const __m128i& a, const __m128i& b) const
{
    enum { imm2 = (sizeof(__m128i) - imm) };
    return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
}

inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
{
    CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
    return v_sse_palignr_u8_class<imm>()(a, b);
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_srli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_slli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(a.val),
            v_sse_reinterpret_as<__m128i>(b.val))));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_sse_internal;
    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(b.val),
            v_sse_reinterpret_as<__m128i>(a.val))));
}
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_si128((__m128i*)ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_si128((__m128i*)ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_si128((__m128i*)ptr, a.val); \
    else \
        _mm_store_si128((__m128i*)ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }

OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_castsi128_##suffix( \
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ _mm_stream_##suffix(ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
{ \
    if( mode == hal::STORE_UNALIGNED ) \
        _mm_storeu_##suffix(ptr, a.val); \
    else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
        _mm_stream_##suffix(ptr, a.val); \
    else \
        _mm_store_##suffix(ptr, a.val); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
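// Horizontal reductions. v_reduce_sum for 8-bit lanes uses _mm_sad_epu8 against
// zero (the sum of absolute differences yields the byte sum per 64-bit half);
// the signed variant biases the input by 128 first and subtracts 16*128 = 2048
// from the result.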
__m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));

__m128i half = _mm_set1_epi8((schar)-128);
half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
inline schar v_reduce_##func(const v_int8x16& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi8((schar)-128); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
    return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
} \
inline uchar v_reduce_##func(const v_uint8x16& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
    return (uchar)_mm_cvtsi128_si32(val); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi16(sbit); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    regtype val = a.val; \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
    return (scalartype)_mm_cvt##extract(val); \
}

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
__m128 ab = _mm_hadd_ps(a.val, b.val);
__m128 cd = _mm_hadd_ps(c.val, d.val);

__m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
__m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
__m128i half = _mm_sad_epu8(a.val, b.val);
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));

__m128i half = _mm_set1_epi8(0x7f);
half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
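// v_popcount: classic bit-slicing. Pairs, nibbles and bytes of set bits are
// accumulated with shift/mask/add; byte counts are then folded into wider lanes
// with v_rotate_right and masked down to the lane width.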
__m128i m1 = _mm_set1_epi32(0x55555555);
__m128i m2 = _mm_set1_epi32(0x33333333);
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f);

p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));

p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);

p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);

{ return v_popcount(v_reinterpret_as_u8(a)); }

{ return v_popcount(v_reinterpret_as_u16(a)); }

{ return v_popcount(v_reinterpret_as_u32(a)); }

{ return v_popcount(v_reinterpret_as_u64(a)); }
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \
inline int v_signmask(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)); } \
inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \
inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)

#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \
inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \
inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \
inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
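// v_select(mask, a, b) picks a where the mask lane is all ones and b where it is
// zero; with SSE4.1 this is a single _mm_blendv_*, otherwise the bitwise form
// b ^ ((b ^ a) & mask) is used.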
#if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
#else
#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
#endif
#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = intrin(a.val); \
    b1.val = __CV_CAT(intrin, _high)(a.val); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(intrin(a.val)); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
    return _Tpwvec(intrin(a)); \
}

#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \
inline _Tpvec v_load_expand_q(const _Tp* ptr) \
{ \
    typedef int CV_DECL_ALIGNED(1) unaligned_int; \
    __m128i a = _mm_cvtsi32_si128(*(const unaligned_int*)ptr); \
    return _Tpvec(intrin(a)); \
}

OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32)
#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
}

OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
return v_uint8x16(_mm_shuffle_epi8(a.val, perm));

return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);

{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
return v_uint16x8(_mm_shuffle_epi8(a.val, perm));

__m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));

{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));

{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));

{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
    return v_rotate_right<s>(a, b);
}

{ return v_int32x4(_mm_cvtps_epi32(a.val)); }
__m128i a1 = _mm_cvtps_epi32(a.val);
__m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));

__m128i a1 = _mm_cvtps_epi32(a.val);
__m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));

{ return v_int32x4(_mm_cvttps_epi32(a.val)); }

{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }

__m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
return v_int32x4(_mm_unpacklo_epi64(ai, bi));

__m128i a1 = _mm_cvtpd_epi32(a.val);
__m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8);

__m128i a1 = _mm_cvtpd_epi32(a.val);
__m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8);

{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }
#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
}

OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
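// De-interleaving loads: the v_load_deinterleave overloads below read 2, 3 or 4
// packed channels and separate them with unpack/shuffle networks; the 3-channel
// byte and word cases have faster SSSE3/SSE4.1 paths based on _mm_shuffle_epi8
// and blend intrinsics.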
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));

__m128i t10 = _mm_unpacklo_epi8(t00, t01);
__m128i t11 = _mm_unpackhi_epi8(t00, t01);

__m128i t20 = _mm_unpacklo_epi8(t10, t11);
__m128i t21 = _mm_unpackhi_epi8(t10, t11);

__m128i t30 = _mm_unpacklo_epi8(t20, t21);
__m128i t31 = _mm_unpackhi_epi8(t20, t21);

a.val = _mm_unpacklo_epi8(t30, t31);
b.val = _mm_unpackhi_epi8(t30, t31);
const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
__m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
__m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
__m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
__m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
__m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
a0 = _mm_shuffle_epi8(a0, sh_b);
b0 = _mm_shuffle_epi8(b0, sh_g);
c0 = _mm_shuffle_epi8(c0, sh_r);
const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);

__m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));

__m128i s0 = _mm_shuffle_epi8(t0, m0);
__m128i s1 = _mm_shuffle_epi8(t1, m1);
__m128i s2 = _mm_shuffle_epi8(t2, m2);

t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
a.val = _mm_alignr_epi8(s2, t0, 5);

t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);

t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
c.val = _mm_alignr_epi8(t2, s0, 11);

__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
__m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

__m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
__m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
__m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

__m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
__m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
__m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

__m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
__m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
__m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
__m128i u0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
__m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
__m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48));

__m128i v0 = _mm_unpacklo_epi8(u0, u2);
__m128i v1 = _mm_unpackhi_epi8(u0, u2);
__m128i v2 = _mm_unpacklo_epi8(u1, u3);
__m128i v3 = _mm_unpackhi_epi8(u1, u3);

u0 = _mm_unpacklo_epi8(v0, v2);
u1 = _mm_unpacklo_epi8(v1, v3);
u2 = _mm_unpackhi_epi8(v0, v2);
u3 = _mm_unpackhi_epi8(v1, v3);

v0 = _mm_unpacklo_epi8(u0, u1);
v1 = _mm_unpacklo_epi8(u2, u3);
v2 = _mm_unpackhi_epi8(u0, u1);
v3 = _mm_unpackhi_epi8(u2, u3);

a.val = _mm_unpacklo_epi8(v0, v1);
b.val = _mm_unpackhi_epi8(v0, v1);
c.val = _mm_unpacklo_epi8(v2, v3);
d.val = _mm_unpackhi_epi8(v2, v3);
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));

__m128i v2 = _mm_unpacklo_epi16(v0, v1);
__m128i v3 = _mm_unpackhi_epi16(v0, v1);
__m128i v4 = _mm_unpacklo_epi16(v2, v3);
__m128i v5 = _mm_unpackhi_epi16(v2, v3);

a.val = _mm_unpacklo_epi16(v4, v5);
b.val = _mm_unpackhi_epi16(v4, v5);
2223 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
2224 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
2225 __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
2226 __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
2227 __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
2228 __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
2230 const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2231 const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2232 const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2233 a0 = _mm_shuffle_epi8(a0, sh_a);
2234 b0 = _mm_shuffle_epi8(b0, sh_b);
2235 c0 = _mm_shuffle_epi8(c0, sh_c);
2241 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2242 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2243 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2245 __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
2246 __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
2247 __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
2249 __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
2250 __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
2251 __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
2253 a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
2254 b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
2255 c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
2261 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr);
2262 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2263 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2264 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24));
2266 __m128i v0 = _mm_unpacklo_epi16(u0, u2);
2267 __m128i v1 = _mm_unpackhi_epi16(u0, u2);
2268 __m128i v2 = _mm_unpacklo_epi16(u1, u3);
2269 __m128i v3 = _mm_unpackhi_epi16(u1, u3);
2271 u0 = _mm_unpacklo_epi16(v0, v2);
2272 u1 = _mm_unpacklo_epi16(v1, v3);
2273 u2 = _mm_unpackhi_epi16(v0, v2);
2274 u3 = _mm_unpackhi_epi16(v1, v3);
2276 a.val = _mm_unpacklo_epi16(u0, u1);
2277 b.val = _mm_unpackhi_epi16(u0, u1);
2278 c.val = _mm_unpacklo_epi16(u2, u3);
2279 d.val = _mm_unpackhi_epi16(u2, u3);
2284 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
2285 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4));
2287 __m128i v2 = _mm_unpacklo_epi32(v0, v1);
2288 __m128i v3 = _mm_unpackhi_epi32(v0, v1);
2290 a.val = _mm_unpacklo_epi32(v2, v3);
2291 b.val = _mm_unpackhi_epi32(v2, v3);
2296 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2297 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
2298 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2300 __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
2301 __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
2302 __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
2304 a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
2305 b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
2306 c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
2311 v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr));
2312 v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4)));
2313 v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8)));
2314 v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12)));
2321 __m128 u0 = _mm_loadu_ps(ptr);
2322 __m128 u1 = _mm_loadu_ps((ptr + 4));
2324 a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
2325 b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1));
2330 __m128 t0 = _mm_loadu_ps(ptr + 0);
2331 __m128 t1 = _mm_loadu_ps(ptr + 4);
2332 __m128 t2 = _mm_loadu_ps(ptr + 8);
2334 __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
2335 a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));
2337 __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
2338 __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
2339 b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));
2341 __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
2342 c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
2347 __m128 t0 = _mm_loadu_ps(ptr + 0);
2348 __m128 t1 = _mm_loadu_ps(ptr + 4);
2349 __m128 t2 = _mm_loadu_ps(ptr + 8);
2350 __m128 t3 = _mm_loadu_ps(ptr + 12);
2351 __m128 t02lo = _mm_unpacklo_ps(t0, t2);
2352 __m128 t13lo = _mm_unpacklo_ps(t1, t3);
2353 __m128 t02hi = _mm_unpackhi_ps(t0, t2);
2354 __m128 t13hi = _mm_unpackhi_ps(t1, t3);
2355 a.val = _mm_unpacklo_ps(t02lo, t13lo);
2356 b.val = _mm_unpackhi_ps(t02lo, t13lo);
2357 c.val = _mm_unpacklo_ps(t02hi, t13hi);
2358 d.val = _mm_unpackhi_ps(t02hi, t13hi);
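// 4-channel float deinterleave: the standard 4x4 transpose idiom built from
// _mm_unpacklo_ps/_mm_unpackhi_ps (the same network used by _MM_TRANSPOSE4_PS).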
2363 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2364 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
2372 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2373 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
2374 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
2376 t1 = _mm_shuffle_epi32(t1, 0x4e);
2379 b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
2386 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2387 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
2388 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
2389 __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6));
2402 __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
2403 __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
2407 _mm_stream_si128((__m128i*)(ptr), v0);
2408 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2412 _mm_store_si128((__m128i*)(ptr), v0);
2413 _mm_store_si128((__m128i*)(ptr + 16), v1);
2417 _mm_storeu_si128((__m128i*)(ptr), v0);
2418 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
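// Every v_store_interleave variant below dispatches on hal::StoreMode in the
// same way (the if/else chain itself is not visible in this listing):
// STORE_ALIGNED_NOCACHE uses the non-temporal _mm_stream_si128, STORE_ALIGNED
// uses _mm_store_si128, and the default STORE_UNALIGNED uses _mm_storeu_si128.
// A minimal usage sketch (illustrative only), with hypothetical vectors a, b:
//   v_store_interleave(dst, a, b);                      // unaligned (default)
//   v_store_interleave(dst, a, b, hal::STORE_ALIGNED);  // dst must be 16-byte aligned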
2426 const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2427 const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2428 const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2429 __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2430 __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2431 __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2433 const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2434 const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2435 __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
2436 __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
2437 __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
2439 const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
2440 const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
2441 const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
2443 __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
2444 t0 = _mm_alignr_epi8(c.val, t0, 5);
2445 __m128i v0 = _mm_shuffle_epi8(t0, m0);
2447 __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
2448 t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
2449 __m128i v1 = _mm_shuffle_epi8(t1, m1);
2451 __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
2452 t2 = _mm_alignr_epi8(t2, a.val, 11);
2453 __m128i v2 = _mm_shuffle_epi8(t2, m2);
2455 __m128i z = _mm_setzero_si128();
2456 __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
2457 __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
2458 __m128i c0 = _mm_unpacklo_epi8(c.val, z);
2459 __m128i c1 = _mm_unpackhi_epi8(c.val, z);
2461 __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
2462 __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
2463 __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
2464 __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
2466 __m128i p10 = _mm_unpacklo_epi32(p00, p01);
2467 __m128i p11 = _mm_unpackhi_epi32(p00, p01);
2468 __m128i p12 = _mm_unpacklo_epi32(p02, p03);
2469 __m128i p13 = _mm_unpackhi_epi32(p02, p03);
2471 __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2472 __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2473 __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2474 __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2476 p20 = _mm_slli_si128(p20, 1);
2477 p22 = _mm_slli_si128(p22, 1);
2479 __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
2480 __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
2481 __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
2482 __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
2484 __m128i p40 = _mm_unpacklo_epi64(p30, p31);
2485 __m128i p41 = _mm_unpackhi_epi64(p30, p31);
2486 __m128i p42 = _mm_unpacklo_epi64(p32, p33);
2487 __m128i p43 = _mm_unpackhi_epi64(p32, p33);
2489 __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
2490 __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
2491 __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
2496 _mm_stream_si128((__m128i*)(ptr), v0);
2497 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2498 _mm_stream_si128((__m128i*)(ptr + 32), v2);
2502 _mm_store_si128((__m128i*)(ptr), v0);
2503 _mm_store_si128((__m128i*)(ptr + 16), v1);
2504 _mm_store_si128((__m128i*)(ptr + 32), v2);
2508 _mm_storeu_si128((__m128i*)(ptr), v0);
2509 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2510 _mm_storeu_si128((__m128i*)(ptr + 32), v2);
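// The three variants above (SSE4.1 _mm_blendv_epi8, SSSE3 _mm_alignr_epi8 +
// _mm_shuffle_epi8, and the pure SSE2 unpack/shift network) all produce the
// same 48-byte interleaved block v0/v1/v2 for the 3-channel byte store; the
// preprocessor checks selecting between them are not visible in this listing.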
2522 __m128i u0 = _mm_unpacklo_epi8(a.val, c.val);
2523 __m128i u1 = _mm_unpackhi_epi8(a.val, c.val);
2524 __m128i u2 = _mm_unpacklo_epi8(b.val, d.val);
2525 __m128i u3 = _mm_unpackhi_epi8(b.val, d.val);
2527 __m128i v0 = _mm_unpacklo_epi8(u0, u2);
2528 __m128i v1 = _mm_unpackhi_epi8(u0, u2);
2529 __m128i v2 = _mm_unpacklo_epi8(u1, u3);
2530 __m128i v3 = _mm_unpackhi_epi8(u1, u3);
2534 _mm_stream_si128((__m128i*)(ptr), v0);
2535 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2536 _mm_stream_si128((__m128i*)(ptr + 32), v2);
2537 _mm_stream_si128((__m128i*)(ptr + 48), v3);
2541 _mm_store_si128((__m128i*)(ptr), v0);
2542 _mm_store_si128((__m128i*)(ptr + 16), v1);
2543 _mm_store_si128((__m128i*)(ptr + 32), v2);
2544 _mm_store_si128((__m128i*)(ptr + 48), v3);
2548 _mm_storeu_si128((__m128i*)(ptr), v0);
2549 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2550 _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2551 _mm_storeu_si128((__m128i*)(ptr + 48), v3);
2558 __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
2559 __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
2563 _mm_stream_si128((__m128i*)(ptr), v0);
2564 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2568 _mm_store_si128((__m128i*)(ptr), v0);
2569 _mm_store_si128((__m128i*)(ptr + 8), v1);
2573 _mm_storeu_si128((__m128i*)(ptr), v0);
2574 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2583 const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2584 const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2585 const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2586 __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2587 __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2588 __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2590 __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
2591 __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
2592 __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
2594 __m128i z = _mm_setzero_si128();
2595 __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
2596 __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
2597 __m128i c0 = _mm_unpacklo_epi16(c.val, z);
2598 __m128i c1 = _mm_unpackhi_epi16(c.val, z);
2600 __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
2601 __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
2602 __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
2603 __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
2605 __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2606 __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2607 __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2608 __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2610 p20 = _mm_slli_si128(p20, 2);
2611 p22 = _mm_slli_si128(p22, 2);
2613 __m128i p30 = _mm_unpacklo_epi64(p20, p21);
2614 __m128i p31 = _mm_unpackhi_epi64(p20, p21);
2615 __m128i p32 = _mm_unpacklo_epi64(p22, p23);
2616 __m128i p33 = _mm_unpackhi_epi64(p22, p23);
2618 __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
2619 __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
2620 __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
2624 _mm_stream_si128((__m128i*)(ptr), v0);
2625 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2626 _mm_stream_si128((__m128i*)(ptr + 16), v2);
2630 _mm_store_si128((__m128i*)(ptr), v0);
2631 _mm_store_si128((__m128i*)(ptr + 8), v1);
2632 _mm_store_si128((__m128i*)(ptr + 16), v2);
2636 _mm_storeu_si128((__m128i*)(ptr), v0);
2637 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2638 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2650 __m128i u0 = _mm_unpacklo_epi16(a.val, c.val);
2651 __m128i u1 = _mm_unpackhi_epi16(a.val, c.val);
2652 __m128i u2 = _mm_unpacklo_epi16(b.val, d.val);
2653 __m128i u3 = _mm_unpackhi_epi16(b.val, d.val);
2655 __m128i v0 = _mm_unpacklo_epi16(u0, u2);
2656 __m128i v1 = _mm_unpackhi_epi16(u0, u2);
2657 __m128i v2 = _mm_unpacklo_epi16(u1, u3);
2658 __m128i v3 = _mm_unpackhi_epi16(u1, u3);
2662 _mm_stream_si128((__m128i*)(ptr), v0);
2663 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2664 _mm_stream_si128((__m128i*)(ptr + 16), v2);
2665 _mm_stream_si128((__m128i*)(ptr + 24), v3);
2669 _mm_store_si128((__m128i*)(ptr), v0);
2670 _mm_store_si128((__m128i*)(ptr + 8), v1);
2671 _mm_store_si128((__m128i*)(ptr + 16), v2);
2672 _mm_store_si128((__m128i*)(ptr + 24), v3);
2676 _mm_storeu_si128((__m128i*)(ptr), v0);
2677 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2678 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2679 _mm_storeu_si128((__m128i*)(ptr + 24), v3);
2686 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
2687 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
2691 _mm_stream_si128((__m128i*)(ptr), v0);
2692 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2696 _mm_store_si128((__m128i*)(ptr), v0);
2697 _mm_store_si128((__m128i*)(ptr + 4), v1);
2701 _mm_storeu_si128((__m128i*)(ptr), v0);
2702 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2709 v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
2712 __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
2713 __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
2714 __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
2718 _mm_stream_si128((__m128i*)(ptr), v0);
2719 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2720 _mm_stream_si128((__m128i*)(ptr + 8), v2);
2724 _mm_store_si128((__m128i*)(ptr), v0);
2725 _mm_store_si128((__m128i*)(ptr + 4), v1);
2726 _mm_store_si128((__m128i*)(ptr + 8), v2);
2730 _mm_storeu_si128((__m128i*)(ptr), v0);
2731 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2732 _mm_storeu_si128((__m128i*)(ptr + 8), v2);
2745 _mm_stream_si128((__m128i*)(ptr), v0.val);
2746 _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
2747 _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
2748 _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
2752 _mm_store_si128((__m128i*)(ptr), v0.val);
2753 _mm_store_si128((__m128i*)(ptr + 4), v1.val);
2754 _mm_store_si128((__m128i*)(ptr + 8), v2.val);
2755 _mm_store_si128((__m128i*)(ptr + 12), v3.val);
2759 _mm_storeu_si128((__m128i*)(ptr), v0.val);
2760 _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
2761 _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
2762 _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
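// 4-channel 32-bit store: in the full header the registers v0..v3 stored here
// are presumably produced just above (not shown in this listing) by a 4x4
// transpose of the input channels, so each stored vector holds one
// interleaved (a_i, b_i, c_i, d_i) quadruple.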
2770 __m128 v0 = _mm_unpacklo_ps(a.val, b.val);
2771 __m128 v1 = _mm_unpackhi_ps(a.val, b.val);
2775 _mm_stream_ps(ptr, v0);
2776 _mm_stream_ps(ptr + 4, v1);
2780 _mm_store_ps(ptr, v0);
2781 _mm_store_ps(ptr + 4, v1);
2785 _mm_storeu_ps(ptr, v0);
2786 _mm_storeu_ps(ptr + 4, v1);
2793 __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
2794 __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
2795 __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
2796 __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
2797 __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
2798 __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
2799 __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
2800 __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
2801 __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
2805 _mm_stream_ps(ptr, v0);
2806 _mm_stream_ps(ptr + 4, v1);
2807 _mm_stream_ps(ptr + 8, v2);
2811 _mm_store_ps(ptr, v0);
2812 _mm_store_ps(ptr + 4, v1);
2813 _mm_store_ps(ptr + 8, v2);
2817 _mm_storeu_ps(ptr, v0);
2818 _mm_storeu_ps(ptr + 4, v1);
2819 _mm_storeu_ps(ptr + 8, v2);
2827 __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
2828 __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
2829 __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
2830 __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
2831 __m128 v0 = _mm_unpacklo_ps(u0, u1);
2832 __m128 v2 = _mm_unpacklo_ps(u2, u3);
2833 __m128 v1 = _mm_unpackhi_ps(u0, u1);
2834 __m128 v3 = _mm_unpackhi_ps(u2, u3);
2838 _mm_stream_ps(ptr, v0);
2839 _mm_stream_ps(ptr + 4, v1);
2840 _mm_stream_ps(ptr + 8, v2);
2841 _mm_stream_ps(ptr + 12, v3);
2845 _mm_store_ps(ptr, v0);
2846 _mm_store_ps(ptr + 4, v1);
2847 _mm_store_ps(ptr + 8, v2);
2848 _mm_store_ps(ptr + 12, v3);
2852 _mm_storeu_ps(ptr, v0);
2853 _mm_storeu_ps(ptr + 4, v1);
2854 _mm_storeu_ps(ptr + 8, v2);
2855 _mm_storeu_ps(ptr + 12, v3);
2862 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2863 __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
2867 _mm_stream_si128((__m128i*)(ptr), v0);
2868 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2872 _mm_store_si128((__m128i*)(ptr), v0);
2873 _mm_store_si128((__m128i*)(ptr + 2), v1);
2877 _mm_storeu_si128((__m128i*)(ptr), v0);
2878 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2885 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2886 __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
2887 __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
2891 _mm_stream_si128((__m128i*)(ptr), v0);
2892 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2893 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2897 _mm_store_si128((__m128i*)(ptr), v0);
2898 _mm_store_si128((__m128i*)(ptr + 2), v1);
2899 _mm_store_si128((__m128i*)(ptr + 4), v2);
2903 _mm_storeu_si128((__m128i*)(ptr), v0);
2904 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2905 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2913 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2914 __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
2915 __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
2916 __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
2920 _mm_stream_si128((__m128i*)(ptr), v0);
2921 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2922 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2923 _mm_stream_si128((__m128i*)(ptr + 6), v3);
2927 _mm_store_si128((__m128i*)(ptr), v0);
2928 _mm_store_si128((__m128i*)(ptr + 2), v1);
2929 _mm_store_si128((__m128i*)(ptr + 4), v2);
2930 _mm_store_si128((__m128i*)(ptr + 6), v3);
2934 _mm_storeu_si128((__m128i*)(ptr), v0);
2935 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2936 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2937 _mm_storeu_si128((__m128i*)(ptr + 6), v3);
2941 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2942 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2945 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2946 a0 = v_reinterpret_as_##suffix0(a1); \
2947 b0 = v_reinterpret_as_##suffix0(b1); \
2949 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2951 _Tpvec1 a1, b1, c1; \
2952 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2953 a0 = v_reinterpret_as_##suffix0(a1); \
2954 b0 = v_reinterpret_as_##suffix0(b1); \
2955 c0 = v_reinterpret_as_##suffix0(c1); \
2957 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2959 _Tpvec1 a1, b1, c1, d1; \
2960 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2961 a0 = v_reinterpret_as_##suffix0(a1); \
2962 b0 = v_reinterpret_as_##suffix0(b1); \
2963 c0 = v_reinterpret_as_##suffix0(c1); \
2964 d0 = v_reinterpret_as_##suffix0(d1); \
2966 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2967 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2969 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2970 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2971 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2973 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2974 const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2976 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2977 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2978 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2979 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2981 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2982 const _Tpvec0& c0, const _Tpvec0& d0, \
2983 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2985 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2986 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2987 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2988 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2989 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
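// This macro forwards the load/store-interleave API for a "derived" vector
// type (_Tpvec0, e.g. a signed or floating-point type) to an existing
// implementation for its same-width unsigned counterpart (_Tpvec1) via
// v_reinterpret_as_##suffix casts, so only the unsigned versions need real
// SSE code. A representative instantiation would look like this (illustrative;
// the exact list of instantiations is not shown in this listing):
//   OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)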
3010 return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
3020 return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
3030 return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
3037 __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000);
3038 __m128i magic_i_all = _mm_set1_epi64x(0x4530000080100000);
3039 __m128d magic_d_all = _mm_castsi128_pd(magic_i_all);
3042 __m128i magic_i_lo = _mm_set1_epi64x(0x4330000000000000);
3043 __m128i v_lo = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
3045 __m128i magic_i_lo = _mm_set1_epi32(0x43300000);
3046 __m128i v_lo = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
3049 __m128i v_hi = _mm_srli_epi64(v.val, 32);
3051 v_hi = _mm_xor_si128(v_hi, magic_i_hi32);
3053 __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
3055 __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
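// Signed 64-bit integer -> double conversion without AVX-512: the low 32 bits
// of each lane are tagged with the exponent of 2^52 and the high 32 bits
// (sign bit flipped) with the exponent of 2^84; subtracting the combined
// magic constant (2^84 + 2^63 + 2^52) and adding the low part reconstructs
// the exact value, since both halves fit in the double mantissa without
// rounding.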
3063 #if defined(_MSC_VER)
3068 _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
3069 _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
3075 #if defined(_MSC_VER)
3076 return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
3077 *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
3080 _mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])),
3081 _mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))
3087 #if defined(_MSC_VER)
3088 return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3089 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3092 _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3093 _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3103 #if defined(_MSC_VER)
3108 _mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
3109 _mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
3115 #if defined(_MSC_VER)
3116 return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3117 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3120 _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3121 _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3127 return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3135 #if defined(_MSC_VER)
3137 tab[idx[2]], tab[idx[3]]));
3140 _mm_setr_pi32(tab[idx[0]], tab[idx[1]]),
3141 _mm_setr_pi32(tab[idx[2]], tab[idx[3]])
3147 return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3151 return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3163 return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3165 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
3190 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
3217 __m128 z = _mm_setzero_ps();
3218 __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
3219 __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
3220 xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
3221 xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
3222 __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
3223 __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
3232 __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
3233 __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
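// The v_lut family emulates a gather: SSE2 has no gather instruction, so the
// elements are fetched with scalar loads and repacked with _mm_setr_* (or the
// MMX _mm_setr_pi* pair outside MSVC). v_lut_pairs and v_lut_quads load 2 or
// 4 consecutive elements per index, and v_lut_deinterleave gathers (x, y)
// pairs and splits them into two registers.
// A minimal usage sketch (illustrative only), with a hypothetical table `tab`:
//   int idx[4] = { 0, 5, 2, 7 };
//   v_int32x4 g = v_lut(tab, idx);   // { tab[0], tab[5], tab[2], tab[7] }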
3241 return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200)));
3243 __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3244 a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
3245 a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
3246 return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3253 return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400)));
3255 __m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3256 return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3264 return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100)));
3266 __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3267 return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)));
3274 return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100)));
3276 return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val)));
3283 return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
3291 return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
3293 __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
3294 __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
3295 return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3303 return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100)));
3305 return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2));
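// v_interleave_pairs / v_interleave_quads reorder lanes within each half of
// the vector (a0 a2 a1 a3 ... for pairs); with SSSE3 this is a single
// _mm_shuffle_epi8 against a constant pattern, while the SSE2 fallback
// emulates it with word shuffles and unpacks. v_pack_triplets compacts each
// group of four lanes down to its first three (useful for 3-channel data),
// leaving the trailing lanes unspecified.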
3318 return (uchar)_mm_extract_epi8(v.val, i);
3320 return v_rotate_right<i>(v).get0();
3327 return (schar)v_extract_n<i>(v_reinterpret_as_u8(v));
3333 return (ushort)_mm_extract_epi16(v.val, i);
3339 return (short)v_extract_n<i>(v_reinterpret_as_u16(v));
3346 return (uint)_mm_extract_epi32(v.val, i);
3348 return v_rotate_right<i>(v).get0();
3355 return (int)v_extract_n<i>(v_reinterpret_as_u32(v));
3361 #ifdef CV__SIMD_NATIVE_mm_extract_epi64
3362 return (uint64)_v128_extract_epi64<i>(v.val);
3364 return v_rotate_right<i>(v).get0();
3371 return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
3377 union { uint iv; float fv; } d;
3378 d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
3385 union { uint64 iv; double dv; } d;
3386 d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
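// v_extract_n<i>: with SSE4.1 the lane is read directly via
// _mm_extract_epi8/16/32 (and a 64-bit extract where available); otherwise it
// falls back to v_rotate_right<i>(v).get0(). Float and double lanes are
// extracted bit-exactly through the integer path and reinterpreted via a union.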
3393 return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3399 return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3405 return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i)));
3413 return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3415 const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
3416 const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
3417 const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
3418 __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr));
3419 __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
3420 __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta);
3421 __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
3423 t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
3424 __m128i zmask = _mm_cmpeq_epi32(e, z);
3425 __m128i ft = v_select_si128(zmask, zt, t);
3426 return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
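// Software float16 -> float32 load (used when F16C is unavailable; the F16C
// path above is a single _mm_cvtph_ps): the 16-bit values are widened into
// the upper halves of 32-bit lanes, the exponent is rebiased by adding
// 0x38000000 (the 112-step bias difference shifted into place), zeros and
// subnormals are fixed up through the float-magic path (zt), Inf/NaN get a
// second exponent bump, and the sign bit is OR-ed back at the end.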
3433 __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
3434 _mm_storel_epi64((__m128i*)ptr, fp16_value);
3436 const __m128i signmask = _mm_set1_epi32(0x80000000);
3437 const __m128i rval = _mm_set1_epi32(0x3f000000);
3439 __m128i t = _mm_castps_si128(v.val);
3440 __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
3441 t = _mm_andnot_si128(signmask, t);
3443 __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
3444 __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
3445 __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
3446 __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
3447 __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
3448 tt = _mm_sub_epi32(tt, rval);
3449 __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
3450 __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
3451 nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
3452 t = v_select_si128(tinymask, tt, nt);
3453 t = v_select_si128(finitemask, t, naninf);
3454 t = _mm_or_si128(t, sign);
3455 t = _mm_packs_epi32(t, t);
3456 _mm_storel_epi64((__m128i*)ptr, t);
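// Software float32 -> float16 pack store (the F16C path above is a single
// _mm_cvtps_ph plus an 8-byte store): the absolute values are classified as
// tiny, finite or Inf/NaN; tiny inputs go through a float add of 0.5f plus an
// integer bias subtraction, normal inputs are rebiased and rounded to nearest
// even (the +0xfff / +odd-bit trick) before the shift right by 13, NaN/Inf
// map to 0x7e00/0x7c00, then the sign is restored and the results are packed
// with _mm_packs_epi32 and stored as 8 bytes with _mm_storel_epi64.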
3462 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END