EstervQrCode 2.0.0
Library for QR code manipulation
intrin_sse.hpp
1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5// By downloading, copying, installing or using the software you agree to this license.
6// If you do not agree to this license, do not download, install,
7// copy or use the software.
8//
9//
10// License Agreement
11// For Open Source Computer Vision Library
12//
13// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16// Copyright (C) 2015, Itseez Inc., all rights reserved.
17// Third party copyrights are property of their respective owners.
18//
19// Redistribution and use in source and binary forms, with or without modification,
20// are permitted provided that the following conditions are met:
21//
22// * Redistribution's of source code must retain the above copyright notice,
23// this list of conditions and the following disclaimer.
24//
25// * Redistribution's in binary form must reproduce the above copyright notice,
26// this list of conditions and the following disclaimer in the documentation
27// and/or other materials provided with the distribution.
28//
29// * The name of the copyright holders may not be used to endorse or promote products
30// derived from this software without specific prior written permission.
31//
32// This software is provided by the copyright holders and contributors "as is" and
33// any express or implied warranties, including, but not limited to, the implied
34// warranties of merchantability and fitness for a particular purpose are disclaimed.
35// In no event shall the Intel Corporation or contributors be liable for any direct,
36// indirect, incidental, special, exemplary, or consequential damages
37// (including, but not limited to, procurement of substitute goods or services;
38// loss of use, data, or profits; or business interruption) however caused
39// and on any theory of liability, whether in contract, strict liability,
40// or tort (including negligence or otherwise) arising in any way out of
41// the use of this software, even if advised of the possibility of such damage.
42//
43//M*/
44
45#ifndef OPENCV_HAL_SSE_HPP
46#define OPENCV_HAL_SSE_HPP
47
48#include <algorithm>
49#include "opencv2/core/utility.hpp"
50
51#define CV_SIMD128 1
52#define CV_SIMD128_64F 1
53#define CV_SIMD128_FP16 0 // no native operations with FP16 type.
54
55namespace cv
56{
57
59
60//
61// Compilation troubleshooting:
62// - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned
63// Replace parameter declaration to const reference:
64// -v_int32x4 a
65// +const v_int32x4& a
66//
67
68CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
69
71
72struct v_uint8x16
73{
74 typedef uchar lane_type;
75 typedef __m128i vector_type;
76 enum { nlanes = 16 };
77
78 /* coverity[uninit_ctor]: suppress warning */
79 v_uint8x16() {}
80 explicit v_uint8x16(__m128i v) : val(v) {}
81 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
82 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
83 {
84 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
85 (char)v4, (char)v5, (char)v6, (char)v7,
86 (char)v8, (char)v9, (char)v10, (char)v11,
87 (char)v12, (char)v13, (char)v14, (char)v15);
88 }
89
90 uchar get0() const
91 {
92 return (uchar)_mm_cvtsi128_si32(val);
93 }
94
95 __m128i val;
96};
97
98struct v_int8x16
99{
100 typedef schar lane_type;
101 typedef __m128i vector_type;
102 enum { nlanes = 16 };
103
104 /* coverity[uninit_ctor]: suppress warning */
105 v_int8x16() {}
106 explicit v_int8x16(__m128i v) : val(v) {}
107 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
108 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
109 {
110 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
111 (char)v4, (char)v5, (char)v6, (char)v7,
112 (char)v8, (char)v9, (char)v10, (char)v11,
113 (char)v12, (char)v13, (char)v14, (char)v15);
114 }
115
116 schar get0() const
117 {
118 return (schar)_mm_cvtsi128_si32(val);
119 }
120
121 __m128i val;
122};
123
124struct v_uint16x8
125{
126 typedef ushort lane_type;
127 typedef __m128i vector_type;
128 enum { nlanes = 8 };
129
130 /* coverity[uninit_ctor]: suppress warning */
131 v_uint16x8() {}
132 explicit v_uint16x8(__m128i v) : val(v) {}
133 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
134 {
135 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
136 (short)v4, (short)v5, (short)v6, (short)v7);
137 }
138
139 ushort get0() const
140 {
141 return (ushort)_mm_cvtsi128_si32(val);
142 }
143
144 __m128i val;
145};
146
147struct v_int16x8
148{
149 typedef short lane_type;
150 typedef __m128i vector_type;
151 enum { nlanes = 8 };
152
153 /* coverity[uninit_ctor]: suppress warning */
154 v_int16x8() {}
155 explicit v_int16x8(__m128i v) : val(v) {}
156 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
157 {
158 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
159 (short)v4, (short)v5, (short)v6, (short)v7);
160 }
161
162 short get0() const
163 {
164 return (short)_mm_cvtsi128_si32(val);
165 }
166
167 __m128i val;
168};
169
170struct v_uint32x4
171{
172 typedef unsigned lane_type;
173 typedef __m128i vector_type;
174 enum { nlanes = 4 };
175
176 /* coverity[uninit_ctor]: suppress warning */
177 v_uint32x4() {}
178 explicit v_uint32x4(__m128i v) : val(v) {}
179 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
180 {
181 val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
182 }
183
184 unsigned get0() const
185 {
186 return (unsigned)_mm_cvtsi128_si32(val);
187 }
188
189 __m128i val;
190};
191
192struct v_int32x4
193{
194 typedef int lane_type;
195 typedef __m128i vector_type;
196 enum { nlanes = 4 };
197
198 /* coverity[uninit_ctor]: suppress warning */
199 v_int32x4() {}
200 explicit v_int32x4(__m128i v) : val(v) {}
201 v_int32x4(int v0, int v1, int v2, int v3)
202 {
203 val = _mm_setr_epi32(v0, v1, v2, v3);
204 }
205
206 int get0() const
207 {
208 return _mm_cvtsi128_si32(val);
209 }
210
211 __m128i val;
212};
213
214struct v_float32x4
215{
216 typedef float lane_type;
217 typedef __m128 vector_type;
218 enum { nlanes = 4 };
219
220 /* coverity[uninit_ctor]: suppress warning */
221 v_float32x4() {}
222 explicit v_float32x4(__m128 v) : val(v) {}
223 v_float32x4(float v0, float v1, float v2, float v3)
224 {
225 val = _mm_setr_ps(v0, v1, v2, v3);
226 }
227
228 float get0() const
229 {
230 return _mm_cvtss_f32(val);
231 }
232
233 __m128 val;
234};
235
236struct v_uint64x2
237{
238 typedef uint64 lane_type;
239 typedef __m128i vector_type;
240 enum { nlanes = 2 };
241
242 /* coverity[uninit_ctor]: suppress warning */
243 v_uint64x2() {}
244 explicit v_uint64x2(__m128i v) : val(v) {}
245 v_uint64x2(uint64 v0, uint64 v1)
246 {
247#if defined(_MSC_VER) && _MSC_VER >= 1920/*MSVS 2019*/ && defined(_M_X64) && !defined(__clang__)
248 val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
249#elif defined(__GNUC__)
250 val = _mm_setr_epi64((__m64)v0, (__m64)v1);
251#else
252 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
253#endif
254 }
255
256 uint64 get0() const
257 {
258 #if !defined(__x86_64__) && !defined(_M_X64)
259 int a = _mm_cvtsi128_si32(val);
260 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
261 return (unsigned)a | ((uint64)(unsigned)b << 32);
262 #else
263 return (uint64)_mm_cvtsi128_si64(val);
264 #endif
265 }
266
267 __m128i val;
268};
269
270struct v_int64x2
271{
272 typedef int64 lane_type;
273 typedef __m128i vector_type;
274 enum { nlanes = 2 };
275
276 /* coverity[uninit_ctor]: suppress warning */
277 v_int64x2() {}
278 explicit v_int64x2(__m128i v) : val(v) {}
279 v_int64x2(int64 v0, int64 v1)
280 {
281#if defined(_MSC_VER) && _MSC_VER >= 1920/*MSVS 2019*/ && defined(_M_X64) && !defined(__clang__)
282 val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
283#elif defined(__GNUC__)
284 val = _mm_setr_epi64((__m64)v0, (__m64)v1);
285#else
286 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
287#endif
288 }
289
290 int64 get0() const
291 {
292 #if !defined(__x86_64__) && !defined(_M_X64)
293 int a = _mm_cvtsi128_si32(val);
294 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
295 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
296 #else
297 return _mm_cvtsi128_si64(val);
298 #endif
299 }
300
301 __m128i val;
302};
303
304struct v_float64x2
305{
306 typedef double lane_type;
307 typedef __m128d vector_type;
308 enum { nlanes = 2 };
309
310 /* coverity[uninit_ctor]: suppress warning */
311 v_float64x2() {}
312 explicit v_float64x2(__m128d v) : val(v) {}
313 v_float64x2(double v0, double v1)
314 {
315 val = _mm_setr_pd(v0, v1);
316 }
317
318 double get0() const
319 {
320 return _mm_cvtsd_f64(val);
321 }
322
323 __m128d val;
324};
325
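// Usage sketch (illustrative only; the example_* helper below is hypothetical, not part of this
// header): each v_* struct above is a thin value wrapper around one 128-bit register, with
// lane_type/nlanes describing the layout and get0() extracting lane 0.
inline float example_add_and_read_lane0()
{
    v_float32x4 a(1.f, 2.f, 3.f, 4.f);          // pack four floats into one __m128
    v_float32x4 b(10.f, 20.f, 30.f, 40.f);
    v_float32x4 c(_mm_add_ps(a.val, b.val));    // raw SSE intrinsic on the wrapped register
    return c.get0();                            // 11.f -- only lane 0 is read
}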
326namespace hal_sse_internal
327{
328 template <typename to_sse_type, typename from_sse_type>
329 to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
330
331#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
332 template<> inline \
333 to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
334 { return sse_cast_intrin(a); }
335
336 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
337 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
338 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
339 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
340 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
341 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
342 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
343 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
344 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
345}
346
347#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
348inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
349inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
350template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
351{ return _Tpvec(cast(a.val)); }
352
353OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
354OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
355OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
356OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
357OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
358OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
359OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
360OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
361
362inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
363inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
364inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
365inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
366
367template<typename _Tpvec> inline
368v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
369template<typename _Tpvec> inline
370v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
371inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
372{ return v_float32x4(_mm_castsi128_ps(a.val)); }
373inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
374{ return v_float32x4(_mm_castsi128_ps(a.val)); }
375inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
376{ return v_float64x2(_mm_castsi128_pd(a.val)); }
377inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
378{ return v_float64x2(_mm_castsi128_pd(a.val)); }
379
380#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
381inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
382{ return _Tpvec(_mm_castps_si128(a.val)); } \
383inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
384{ return _Tpvec(_mm_castpd_si128(a.val)); }
385
386OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
387OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
388OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
389OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
390OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
391OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
392OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
393OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
394
395inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
396inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
397inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
398inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
399
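// Usage sketch (illustrative; hypothetical helper): v_setall_*/v_setzero_* broadcast a scalar,
// and the v_reinterpret_as_* family only relabels the same 128 bits -- no value conversion.
inline v_float32x4 example_splat_bits_as_float()
{
    v_int32x4 bits = v_setall_s32(0x3f800000);  // IEEE-754 bit pattern of 1.0f in every lane
    return v_reinterpret_as_f32(bits);          // reads back as {1.f, 1.f, 1.f, 1.f}
}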
401inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
402{
403 __m128i delta = _mm_set1_epi16(255);
404 return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
405 _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
406}
407
408inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
409{
410 __m128i delta = _mm_set1_epi16(255);
411 __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
412 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
413}
414
415inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
416{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
417
418inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
419{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
420
421template<int n> inline
422v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
423{
424 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
425 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
426 return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
427 _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
428}
429
430template<int n> inline
431void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
432{
433 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
434 __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
435 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
436}
437
438template<int n> inline
439v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
440{
441 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
442 return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
443 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
444}
445
446template<int n> inline
447void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
448{
449 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
450 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
451 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
452}
453
454inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
455{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
456
457inline void v_pack_store(schar* ptr, const v_int16x8& a)
458{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
459
460template<int n> inline
461v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
462{
463 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
464 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
465 return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
466 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
467}
468template<int n> inline
469void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
470{
471 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
472 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
473 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
474 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
475}
476
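// Usage sketch (illustrative; hypothetical helpers): v_pack narrows two 16-bit vectors into one
// 8-bit vector with saturation, and v_rshr_pack<n> additionally rounds: it adds 1 << (n-1),
// shifts right by n, then saturates.
inline v_uint8x16 example_narrow_saturating(const v_uint16x8& lo, const v_uint16x8& hi)
{
    return v_pack(lo, hi);            // lanes above 255 clamp to 255
}
inline v_uint8x16 example_narrow_rounding(const v_uint16x8& lo, const v_uint16x8& hi)
{
    return v_rshr_pack<2>(lo, hi);    // (x + 2) >> 2 per lane, saturated to [0, 255]
}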
477
478// byte-wise "mask ? a : b"
479inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
480{
481#if CV_SSE4_1
482 return _mm_blendv_epi8(b, a, mask);
483#else
484 return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
485#endif
486}
487
488inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
489{ return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }
490
491inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
492{
493 __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
494 __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
495 __m128i r = _mm_packs_epi32(a1, a1);
496 _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
497}
498
499template<int n> inline
500v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
501{
502 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
503 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
504 __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
505 return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
506}
507
508template<int n> inline
509void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
510{
511 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
512 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
513 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
514 _mm_storel_epi64((__m128i*)ptr, a2);
515}
516
517inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
518{
519#if CV_SSE4_1
520 return v_uint16x8(_mm_packus_epi32(a.val, b.val));
521#else
522 __m128i delta32 = _mm_set1_epi32(32768);
523
524 // preliminary saturate negative values to zero
525 __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
526 __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));
527
528 __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
529 return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
530#endif
531}
532
533inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
534{
535#if CV_SSE4_1
536 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));
537#else
538 __m128i delta32 = _mm_set1_epi32(32768);
539 __m128i a1 = _mm_sub_epi32(a.val, delta32);
540 __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
541 _mm_storel_epi64((__m128i*)ptr, r);
542#endif
543}
544
545template<int n> inline
546v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
547{
548#if CV_SSE4_1
549 __m128i delta = _mm_set1_epi32(1 << (n - 1));
550 return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
551 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
552#else
553 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
554 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
555 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
556 __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
557 __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
558 return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
559#endif
560}
561
562template<int n> inline
563void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
564{
565#if CV_SSE4_1
566 __m128i delta = _mm_set1_epi32(1 << (n - 1));
567 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
568 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1));
569#else
570 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
571 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
572 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
573 _mm_storel_epi64((__m128i*)ptr, a2);
574#endif
575}
576
577inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
578{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
579
580inline void v_pack_store(short* ptr, const v_int32x4& a)
581{
582 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
583}
584
585template<int n> inline
586v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
587{
588 __m128i delta = _mm_set1_epi32(1 << (n-1));
589 return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
590 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
591}
592
593template<int n> inline
594void v_rshr_pack_store(short* ptr, const v_int32x4& a)
595{
596 __m128i delta = _mm_set1_epi32(1 << (n-1));
597 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
598 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
599}
600
601
602// [a0 0 | b0 0] [a1 0 | b1 0]
603inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
604{
605 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
606 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
607 return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
608}
609
610inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
611{
612 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
613 _mm_storel_epi64((__m128i*)ptr, a1);
614}
615
616// [a0 0 | b0 0] [a1 0 | b1 0]
617inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
618{
619 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
620 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
621 return v_int32x4(_mm_unpacklo_epi32(v0, v1));
622}
623
624inline void v_pack_store(int* ptr, const v_int64x2& a)
625{
626 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
627 _mm_storel_epi64((__m128i*)ptr, a1);
628}
629
630template<int n> inline
631v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
632{
633 uint64 delta = (uint64)1 << (n-1);
634 v_uint64x2 delta2(delta, delta);
635 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
636 __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
637 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
638 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
639 return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
640}
641
642template<int n> inline
643void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
644{
645 uint64 delta = (uint64)1 << (n-1);
646 v_uint64x2 delta2(delta, delta);
647 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
648 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
649 _mm_storel_epi64((__m128i*)ptr, a2);
650}
651
652inline __m128i v_sign_epi64(__m128i a)
653{
654 return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
655}
656
657inline __m128i v_srai_epi64(__m128i a, int imm)
658{
659 __m128i smask = v_sign_epi64(a);
660 return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
661}
662
663template<int n> inline
664v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
665{
666 int64 delta = (int64)1 << (n-1);
667 v_int64x2 delta2(delta, delta);
668 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
669 __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
670 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
671 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
672 return v_int32x4(_mm_unpacklo_epi32(v0, v1));
673}
674
675template<int n> inline
676void v_rshr_pack_store(int* ptr, const v_int64x2& a)
677{
678 int64 delta = (int64)1 << (n-1);
679 v_int64x2 delta2(delta, delta);
680 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
681 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
682 _mm_storel_epi64((__m128i*)ptr, a2);
683}
684
685// pack boolean
686inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
687{
688 __m128i ab = _mm_packs_epi16(a.val, b.val);
689 return v_uint8x16(ab);
690}
691
692inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
693 const v_uint32x4& c, const v_uint32x4& d)
694{
695 __m128i ab = _mm_packs_epi32(a.val, b.val);
696 __m128i cd = _mm_packs_epi32(c.val, d.val);
697 return v_uint8x16(_mm_packs_epi16(ab, cd));
698}
699
700inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
701 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
702 const v_uint64x2& g, const v_uint64x2& h)
703{
704 __m128i ab = _mm_packs_epi32(a.val, b.val);
705 __m128i cd = _mm_packs_epi32(c.val, d.val);
706 __m128i ef = _mm_packs_epi32(e.val, f.val);
707 __m128i gh = _mm_packs_epi32(g.val, h.val);
708
709 __m128i abcd = _mm_packs_epi32(ab, cd);
710 __m128i efgh = _mm_packs_epi32(ef, gh);
711 return v_uint8x16(_mm_packs_epi16(abcd, efgh));
712}
713
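// Usage sketch (illustrative; hypothetical helper): v_pack_b narrows comparison masks
// (all-ones / all-zeros lanes) down to byte masks; it relies on signed saturation, so it is
// only meaningful for such boolean inputs.
inline v_uint8x16 example_pack_two_masks(const v_uint16x8& m0, const v_uint16x8& m1)
{
    return v_pack_b(m0, m1);          // 0xFFFF lanes become 0xFF, zero lanes stay 0
}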
714inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
715 const v_float32x4& m1, const v_float32x4& m2,
716 const v_float32x4& m3)
717{
718 __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
719 __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
720 __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
721 __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
722
723 return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
724}
725
726inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
727 const v_float32x4& m1, const v_float32x4& m2,
728 const v_float32x4& a)
729{
730 __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
731 __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
732 __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
733
734 return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
735}
736
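// Usage sketch (illustrative; hypothetical helper): v_matmul computes v0*m0 + v1*m1 + v2*m2 + v3*m3,
// i.e. a 4x4 matrix stored as four column vectors times a 4-vector; v_matmuladd drops the last
// column and adds a constant vector instead, which suits affine 3D transforms.
inline v_float32x4 example_affine_transform(const v_float32x4& p,
                                            const v_float32x4& col0,
                                            const v_float32x4& col1,
                                            const v_float32x4& col2,
                                            const v_float32x4& translation)
{
    return v_matmuladd(p, col0, col1, col2, translation);
}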
737#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
738 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
739 { \
740 return _Tpvec(intrin(a.val, b.val)); \
741 } \
742 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
743 { \
744 a.val = intrin(a.val, b.val); \
745 return a; \
746 }
747
748OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
749OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
750OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
751OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
752OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
753OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
754OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
755OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
756OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
757OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
758OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32)
759OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
760OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
761OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
762OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
763OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
764OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
765OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
766OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
767OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
768OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
769OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
770OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
771OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
772OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
773OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
774
775// saturating multiply 8-bit, 16-bit
776#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
777 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
778 { \
779 _Tpwvec c, d; \
780 v_mul_expand(a, b, c, d); \
781 return v_pack(c, d); \
782 } \
783 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
784 { a = a * b; return a; }
785
786OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8)
787OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8)
788OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
789OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4)
790
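// Usage sketch (illustrative; hypothetical helper): for 8- and 16-bit lanes the overloaded
// operators above saturate (note the _mm_adds_*/_mm_subs_* intrinsics), unlike the wrapping
// v_add_wrap/v_sub_wrap helpers defined further down.
inline v_uint8x16 example_saturating_add(const v_uint8x16& a, const v_uint8x16& b)
{
    return a + b;                     // e.g. 200 + 100 -> 255, not 44
}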
791// Multiply and expand
792inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
793 v_uint16x8& c, v_uint16x8& d)
794{
795 v_uint16x8 a0, a1, b0, b1;
796 v_expand(a, a0, a1);
797 v_expand(b, b0, b1);
798 c = v_mul_wrap(a0, b0);
799 d = v_mul_wrap(a1, b1);
800}
801
802inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
803 v_int16x8& c, v_int16x8& d)
804{
805 v_int16x8 a0, a1, b0, b1;
806 v_expand(a, a0, a1);
807 v_expand(b, b0, b1);
808 c = v_mul_wrap(a0, b0);
809 d = v_mul_wrap(a1, b1);
810}
811
812inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
813 v_int32x4& c, v_int32x4& d)
814{
815 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
816 __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
817 c.val = _mm_unpacklo_epi16(v0, v1);
818 d.val = _mm_unpackhi_epi16(v0, v1);
819}
820
821inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
822 v_uint32x4& c, v_uint32x4& d)
823{
824 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
825 __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
826 c.val = _mm_unpacklo_epi16(v0, v1);
827 d.val = _mm_unpackhi_epi16(v0, v1);
828}
829
830inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
831 v_uint64x2& c, v_uint64x2& d)
832{
833 __m128i c0 = _mm_mul_epu32(a.val, b.val);
834 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
835 c.val = _mm_unpacklo_epi64(c0, c1);
836 d.val = _mm_unpackhi_epi64(c0, c1);
837}
838
839inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
840inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
841
843
844// 16 >> 32
845inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
846{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
847inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
848{ return v_dotprod(a, b) + c; }
849
850// 32 >> 64
851inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
852{
853#if CV_SSE4_1
854 __m128i even = _mm_mul_epi32(a.val, b.val);
855 __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
856 return v_int64x2(_mm_add_epi64(even, odd));
857#else
858 __m128i even_u = _mm_mul_epu32(a.val, b.val);
859 __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
860 // convert unsigned to signed high multiplication (from: Agner Fog(veclib) and H S Warren: Hacker's delight, 2003, p. 132)
861 __m128i a_sign = _mm_srai_epi32(a.val, 31);
862 __m128i b_sign = _mm_srai_epi32(b.val, 31);
863 // |x * sign of x
864 __m128i axb = _mm_and_si128(a.val, b_sign);
865 __m128i bxa = _mm_and_si128(b.val, a_sign);
866 // sum of sign corrections
867 __m128i ssum = _mm_add_epi32(bxa, axb);
868 __m128i even_ssum = _mm_slli_epi64(ssum, 32);
869 __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));
870 // convert to signed and prod
871 return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
872#endif
873}
874inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
875{ return v_dotprod(a, b) + c; }
876
877// 8 >> 32
878inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
879{
880 __m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even
881 __m128i a1 = _mm_srli_epi16(a.val, 8); // odd
882 __m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
883 __m128i b1 = _mm_srli_epi16(b.val, 8);
884 __m128i p0 = _mm_madd_epi16(a0, b0);
885 __m128i p1 = _mm_madd_epi16(a1, b1);
886 return v_uint32x4(_mm_add_epi32(p0, p1));
887}
888inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
889{ return v_dotprod_expand(a, b) + c; }
890
891inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
892{
893 __m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even
894 __m128i a1 = _mm_srai_epi16(a.val, 8); // odd
895 __m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
896 __m128i b1 = _mm_srai_epi16(b.val, 8);
897 __m128i p0 = _mm_madd_epi16(a0, b0);
898 __m128i p1 = _mm_madd_epi16(a1, b1);
899 return v_int32x4(_mm_add_epi32(p0, p1));
900}
901inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
902{ return v_dotprod_expand(a, b) + c; }
903
904// 16 >> 64
905inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
906{
907 v_uint32x4 c, d;
908 v_mul_expand(a, b, c, d);
909
910 v_uint64x2 c0, c1, d0, d1;
911 v_expand(c, c0, c1);
912 v_expand(d, d0, d1);
913
914 c0 += c1; d0 += d1;
915 return v_uint64x2(_mm_add_epi64(
916 _mm_unpacklo_epi64(c0.val, d0.val),
917 _mm_unpackhi_epi64(c0.val, d0.val)
918 ));
919}
920inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
921{ return v_dotprod_expand(a, b) + c; }
922
923inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
924{
925 v_int32x4 prod = v_dotprod(a, b);
926 v_int64x2 c, d;
927 v_expand(prod, c, d);
928 return v_int64x2(_mm_add_epi64(
929 _mm_unpacklo_epi64(c.val, d.val),
930 _mm_unpackhi_epi64(c.val, d.val)
931 ));
932}
933inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
934{ return v_dotprod_expand(a, b) + c; }
935
936// 32 >> 64f
937inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
938{
939#if CV_SSE4_1
940 return v_cvt_f64(v_dotprod(a, b));
941#else
942 v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
943 v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
944
945 return v_float64x2(_mm_add_pd(
946 _mm_unpacklo_pd(c.val, d.val),
947 _mm_unpackhi_pd(c.val, d.val)
948 ));
949#endif
950}
951inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
952{ return v_dotprod_expand(a, b) + c; }
953
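// Usage sketch (illustrative; hypothetical helper): v_dotprod multiplies adjacent lane pairs and
// sums each pair into one widened lane (PMADDWD for 16-bit inputs), so lane i of the result is
// a[2i]*b[2i] + a[2i+1]*b[2i+1]; the three-argument form also accumulates into c.
inline v_int32x4 example_dot_accumulate(const v_int16x8& a, const v_int16x8& b,
                                        const v_int32x4& acc)
{
    return v_dotprod(a, b, acc);
}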
955
956// 16 >> 32
957inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
958{ return v_dotprod(a, b); }
959inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
960{ return v_dotprod(a, b) + c; }
961
962// 32 >> 64
963inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
964{ return v_dotprod(a, b); }
965inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
966{ return v_dotprod_fast(a, b) + c; }
967
968// 8 >> 32
969inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
970{
971 __m128i a0 = v_expand_low(a).val;
972 __m128i a1 = v_expand_high(a).val;
973 __m128i b0 = v_expand_low(b).val;
974 __m128i b1 = v_expand_high(b).val;
975 __m128i p0 = _mm_madd_epi16(a0, b0);
976 __m128i p1 = _mm_madd_epi16(a1, b1);
977 return v_uint32x4(_mm_add_epi32(p0, p1));
978}
979inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
980{ return v_dotprod_expand_fast(a, b) + c; }
981
982inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
983{
984#if CV_SSE4_1
985 __m128i a0 = _mm_cvtepi8_epi16(a.val);
986 __m128i a1 = v_expand_high(a).val;
987 __m128i b0 = _mm_cvtepi8_epi16(b.val);
988 __m128i b1 = v_expand_high(b).val;
989 __m128i p0 = _mm_madd_epi16(a0, b0);
990 __m128i p1 = _mm_madd_epi16(a1, b1);
991 return v_int32x4(_mm_add_epi32(p0, p1));
992#else
993 return v_dotprod_expand(a, b);
994#endif
995}
996inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
997{ return v_dotprod_expand_fast(a, b) + c; }
998
999// 16 >> 64
1000inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1001{
1002 v_uint32x4 c, d;
1003 v_mul_expand(a, b, c, d);
1004
1005 v_uint64x2 c0, c1, d0, d1;
1006 v_expand(c, c0, c1);
1007 v_expand(d, d0, d1);
1008
1009 c0 += c1; d0 += d1;
1010 return c0 + d0;
1011}
1012inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1013{ return v_dotprod_expand_fast(a, b) + c; }
1014
1015inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1016{
1017 v_int32x4 prod = v_dotprod(a, b);
1018 v_int64x2 c, d;
1019 v_expand(prod, c, d);
1020 return c + d;
1021}
1022inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1023{ return v_dotprod_expand_fast(a, b) + c; }
1024
1025// 32 >> 64f
1026v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
1027inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1028{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
1029inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1030{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
1031
1032#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
1033 OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
1034 OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
1035 OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
1036 inline _Tpvec operator ~ (const _Tpvec& a) \
1037 { \
1038 return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
1039 }
1040
1041OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
1042OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
1043OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
1044OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
1045OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
1046OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
1047OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
1048OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
1049OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
1050OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
1051
1052inline v_float32x4 v_sqrt(const v_float32x4& x)
1053{ return v_float32x4(_mm_sqrt_ps(x.val)); }
1054
1055inline v_float32x4 v_invsqrt(const v_float32x4& x)
1056{
1057 const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
1058 __m128 t = x.val;
1059 __m128 h = _mm_mul_ps(t, _0_5);
1060 t = _mm_rsqrt_ps(t);
1061 t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
1062 return v_float32x4(t);
1063}
1064
1065inline v_float64x2 v_sqrt(const v_float64x2& x)
1066{ return v_float64x2(_mm_sqrt_pd(x.val)); }
1067
1068inline v_float64x2 v_invsqrt(const v_float64x2& x)
1069{
1070 const __m128d v_1 = _mm_set1_pd(1.);
1071 return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
1072}
1073
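// Usage sketch (illustrative; hypothetical helper): the float32 v_invsqrt above refines the
// _mm_rsqrt_ps estimate with one Newton-Raphson step, while the float64 version simply divides.
inline v_float32x4 example_reciprocal_norm(const v_float32x4& sum_of_squares)
{
    return v_invsqrt(sum_of_squares); // approximately 1/sqrt(x) per lane
}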
1074#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
1075inline _Tpuvec v_abs(const _Tpsvec& x) \
1076{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
1077
1078OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
1079OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
1080inline v_uint32x4 v_abs(const v_int32x4& x)
1081{
1082 __m128i s = _mm_srli_epi32(x.val, 31);
1083 __m128i f = _mm_srai_epi32(x.val, 31);
1084 return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
1085}
1086inline v_float32x4 v_abs(const v_float32x4& x)
1087{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
1088inline v_float64x2 v_abs(const v_float64x2& x)
1089{
1090 return v_float64x2(_mm_and_pd(x.val,
1091 _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
1092}
1093
1094// TODO: exp, log, sin, cos
1095
1096#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
1097inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1098{ \
1099 return _Tpvec(intrin(a.val, b.val)); \
1100}
1101
1102OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
1103OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
1104OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
1105OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
1106OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
1107OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
1108OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
1109OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
1110
1111inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
1112{
1113#if CV_SSE4_1
1114 return v_int8x16(_mm_min_epi8(a.val, b.val));
1115#else
1116 __m128i delta = _mm_set1_epi8((char)-128);
1117 return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
1118 _mm_xor_si128(b.val, delta))));
1119#endif
1120}
1121inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
1122{
1123#if CV_SSE4_1
1124 return v_int8x16(_mm_max_epi8(a.val, b.val));
1125#else
1126 __m128i delta = _mm_set1_epi8((char)-128);
1127 return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
1128 _mm_xor_si128(b.val, delta))));
1129#endif
1130}
1131inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
1132{
1133#if CV_SSE4_1
1134 return v_uint16x8(_mm_min_epu16(a.val, b.val));
1135#else
1136 return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
1137#endif
1138}
1139inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
1140{
1141#if CV_SSE4_1
1142 return v_uint16x8(_mm_max_epu16(a.val, b.val));
1143#else
1144 return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
1145#endif
1146}
1147inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
1148{
1149#if CV_SSE4_1
1150 return v_uint32x4(_mm_min_epu32(a.val, b.val));
1151#else
1152 __m128i delta = _mm_set1_epi32((int)0x80000000);
1153 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
1154 return v_uint32x4(v_select_si128(mask, b.val, a.val));
1155#endif
1156}
1157inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
1158{
1159#if CV_SSE4_1
1160 return v_uint32x4(_mm_max_epu32(a.val, b.val));
1161#else
1162 __m128i delta = _mm_set1_epi32((int)0x80000000);
1163 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
1164 return v_uint32x4(v_select_si128(mask, a.val, b.val));
1165#endif
1166}
1167inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
1168{
1169#if CV_SSE4_1
1170 return v_int32x4(_mm_min_epi32(a.val, b.val));
1171#else
1172 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
1173#endif
1174}
1175inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
1176{
1177#if CV_SSE4_1
1178 return v_int32x4(_mm_max_epi32(a.val, b.val));
1179#else
1180 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
1181#endif
1182}
1183
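// Usage sketch (illustrative; hypothetical helper): SSE2 lacks several unsigned and 32-bit
// min/max instructions, hence the emulation branches above; a per-lane clamp is just the
// min/max pair.
inline v_int32x4 example_clamp(const v_int32x4& x, const v_int32x4& lo, const v_int32x4& hi)
{
    return v_min(v_max(x, lo), hi);
}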
1184#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
1185inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
1186{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1187inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
1188{ \
1189 __m128i not_mask = _mm_set1_epi32(-1); \
1190 return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
1191} \
1192inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
1193{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1194inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
1195{ \
1196 __m128i not_mask = _mm_set1_epi32(-1); \
1197 return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
1198} \
1199inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
1200{ \
1201 __m128i smask = _mm_set1_##suffix(sbit); \
1202 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
1203} \
1204inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
1205{ \
1206 __m128i smask = _mm_set1_##suffix(sbit); \
1207 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
1208} \
1209inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
1210{ \
1211 __m128i smask = _mm_set1_##suffix(sbit); \
1212 __m128i not_mask = _mm_set1_epi32(-1); \
1213 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
1214 return _Tpuvec(_mm_xor_si128(res, not_mask)); \
1215} \
1216inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
1217{ \
1218 __m128i smask = _mm_set1_##suffix(sbit); \
1219 __m128i not_mask = _mm_set1_epi32(-1); \
1220 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
1221 return _Tpuvec(_mm_xor_si128(res, not_mask)); \
1222} \
1223inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
1224{ \
1225 return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
1226} \
1227inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
1228{ \
1229 return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
1230} \
1231inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
1232{ \
1233 __m128i not_mask = _mm_set1_epi32(-1); \
1234 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
1235} \
1236inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
1237{ \
1238 __m128i not_mask = _mm_set1_epi32(-1); \
1239 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
1240}
1241
1242OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
1243OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
1244OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
1245
1246#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
1247inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1248{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
1249inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1250{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
1251inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1252{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
1253inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1254{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
1255inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1256{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
1257inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1258{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
1259
1260OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
1261OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
1262
1263#if CV_SSE4_1
1264#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
1265inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1266{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
1267inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1268{ return ~(a == b); }
1269#else
1270#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
1271inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1272{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
1273 return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
1274inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1275{ return ~(a == b); }
1276#endif
1277
1278OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
1279OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
1280
1281inline v_float32x4 v_not_nan(const v_float32x4& a)
1282{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
1283inline v_float64x2 v_not_nan(const v_float64x2& a)
1284{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
1285
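// Usage sketch (illustrative; hypothetical helper): the comparison operators return per-lane
// masks of all ones or all zeros, which combine directly with the byte-wise select helper
// defined earlier.
inline v_int32x4 example_select_larger(const v_int32x4& a, const v_int32x4& b)
{
    v_int32x4 mask = a >= b;                                   // 0xFFFFFFFF where a >= b
    return v_int32x4(v_select_si128(mask.val, a.val, b.val));  // mask ? a : b, i.e. v_max(a, b)
}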
1286OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
1287OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
1288OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
1289OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
1290OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
1291OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
1292OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
1293OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
1294OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16)
1295OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
1296
1297inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
1298{
1299 __m128i ad = _mm_srai_epi16(a.val, 8);
1300 __m128i bd = _mm_srai_epi16(b.val, 8);
1301 __m128i p0 = _mm_mullo_epi16(a.val, b.val); // even
1302 __m128i p1 = _mm_slli_epi16(_mm_mullo_epi16(ad, bd), 8); // odd
1303 const __m128i b01 = _mm_set1_epi32(0xFF00FF00);
1304 return v_uint8x16(_v128_blendv_epi8(p0, p1, b01));
1305}
1306inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
1307{
1308 return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
1309}
1310
1313inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
1314{ return v_add_wrap(a - b, b - a); }
1315inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
1316{ return v_add_wrap(a - b, b - a); }
1317inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
1318{ return v_max(a, b) - v_min(a, b); }
1319
1320inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1321{
1322 v_int8x16 d = v_sub_wrap(a, b);
1323 v_int8x16 m = a < b;
1324 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1325}
1326inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1327{
1328 return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
1329}
1330inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1331{
1332 v_int32x4 d = a - b;
1333 v_int32x4 m = a < b;
1334 return v_reinterpret_as_u32((d ^ m) - m);
1335}
1336
1338inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1339{
1340 v_int8x16 d = a - b;
1341 v_int8x16 m = a < b;
1342 return (d ^ m) - m;
1343 }
1344inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1345{ return v_max(a, b) - v_min(a, b); }
1346
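// Usage sketch (illustrative; hypothetical helper): for unsigned lanes, v_absdiff builds |a - b|
// from the two saturating subtractions a - b and b - a (the wrong-order one saturates to zero)
// combined with a wrap-around add.
inline v_uint8x16 example_absolute_difference(const v_uint8x16& a, const v_uint8x16& b)
{
    return v_absdiff(a, b);
}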
1347
1348inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1349{
1350 return a * b + c;
1351}
1352
1353inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1354{
1355 return v_fma(a, b, c);
1356}
1357
1358inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1359{
1360#if CV_FMA3
1361 return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
1362#else
1363 return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
1364#endif
1365}
1366
1367inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1368{
1369#if CV_FMA3
1370 return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
1371#else
1372 return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
1373#endif
1374}
1375
1376#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
1377inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
1378{ \
1379 _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
1380 return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
1381} \
1382inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1383{ \
1384 _Tpvec res = v_fma(a, a, b*b); \
1385 return _Tpvec(_mm_sqrt_##suffix(res.val)); \
1386} \
1387inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1388{ \
1389 return v_fma(a, a, b*b); \
1390} \
1391inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1392{ \
1393 return v_fma(a, b, c); \
1394}
1395
1396OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
1397OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
1398
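// Usage sketch (illustrative; hypothetical helper): v_sqr_magnitude is v_fma(a, a, b*b), i.e.
// a*a + b*b per lane, and v_magnitude adds the square root on top.
inline v_float32x4 example_squared_norm(const v_float32x4& x, const v_float32x4& y)
{
    return v_sqr_magnitude(x, y);
}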
1399#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
1400inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
1401{ \
1402 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1403} \
1404inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
1405{ \
1406 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1407} \
1408inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
1409{ \
1410 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1411} \
1412inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
1413{ \
1414 return _Tpsvec(srai(a.val, imm)); \
1415} \
1416template<int imm> \
1417inline _Tpuvec v_shl(const _Tpuvec& a) \
1418{ \
1419 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1420} \
1421template<int imm> \
1422inline _Tpsvec v_shl(const _Tpsvec& a) \
1423{ \
1424 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1425} \
1426template<int imm> \
1427inline _Tpuvec v_shr(const _Tpuvec& a) \
1428{ \
1429 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1430} \
1431template<int imm> \
1432inline _Tpsvec v_shr(const _Tpsvec& a) \
1433{ \
1434 return _Tpsvec(srai(a.val, imm)); \
1435}
1436
1437OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
1438OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
1439OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
1440
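// Usage sketch (illustrative; hypothetical helper): operator<< and operator>> take a run-time
// shift count, while v_shl/v_shr take it as a template parameter; 64-bit arithmetic right
// shifts go through the v_srai_epi64 emulation above, since SSE2 has no such instruction.
inline v_int32x4 example_arithmetic_halve(const v_int32x4& a)
{
    return v_shr<1>(a);               // arithmetic shift: negative lanes round toward -infinity
}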
1441namespace hal_sse_internal
1442{
1443 template <int imm,
1444 bool is_invalid = ((imm < 0) || (imm > 16)),
1445 bool is_first = (imm == 0),
1446 bool is_half = (imm == 8),
1447 bool is_second = (imm == 16),
1448 bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
1449 class v_sse_palignr_u8_class;
1450
1451 template <int imm>
1452 class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
1453
1454 template <int imm>
1455 class v_sse_palignr_u8_class<imm, false, true, false, false, false>
1456 {
1457 public:
1458 inline __m128i operator()(const __m128i& a, const __m128i&) const
1459 {
1460 return a;
1461 }
1462 };
1463
1464 template <int imm>
1465 class v_sse_palignr_u8_class<imm, false, false, true, false, false>
1466 {
1467 public:
1468 inline __m128i operator()(const __m128i& a, const __m128i& b) const
1469 {
1470 return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
1471 }
1472 };
1473
1474 template <int imm>
1475 class v_sse_palignr_u8_class<imm, false, false, false, true, false>
1476 {
1477 public:
1478 inline __m128i operator()(const __m128i&, const __m128i& b) const
1479 {
1480 return b;
1481 }
1482 };
1483
1484 template <int imm>
1485 class v_sse_palignr_u8_class<imm, false, false, false, false, true>
1486 {
1487#if CV_SSSE3
1488 public:
1489 inline __m128i operator()(const __m128i& a, const __m128i& b) const
1490 {
1491 return _mm_alignr_epi8(b, a, imm);
1492 }
1493#else
1494 public:
1495 inline __m128i operator()(const __m128i& a, const __m128i& b) const
1496 {
1497 enum { imm2 = (sizeof(__m128i) - imm) };
1498 return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
1499 }
1500#endif
1501 };
1502
1503 template <int imm>
1504 inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
1505 {
1506 CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
1507 return v_sse_palignr_u8_class<imm>()(a, b);
1508 }
1509}
1510
1511template<int imm, typename _Tpvec>
1512inline _Tpvec v_rotate_right(const _Tpvec &a)
1513{
1514 using namespace hal_sse_internal;
1515 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1516 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1517 _mm_srli_si128(
1518 v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1519}
1520
1521template<int imm, typename _Tpvec>
1522inline _Tpvec v_rotate_left(const _Tpvec &a)
1523{
1524 using namespace hal_sse_internal;
1525 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1526 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1527 _mm_slli_si128(
1528 v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1529}
1530
1531template<int imm, typename _Tpvec>
1532inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1533{
1534 using namespace hal_sse_internal;
1535 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1536 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1537 v_sse_palignr_u8<imm2>(
1538 v_sse_reinterpret_as<__m128i>(a.val),
1539 v_sse_reinterpret_as<__m128i>(b.val))));
1540}
1541
1542template<int imm, typename _Tpvec>
1543inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1544{
1545 using namespace hal_sse_internal;
1546 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1547 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1548 v_sse_palignr_u8<imm2>(
1549 v_sse_reinterpret_as<__m128i>(b.val),
1550 v_sse_reinterpret_as<__m128i>(a.val))));
1551}
1552
1553#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1554inline _Tpvec v_load(const _Tp* ptr) \
1555{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
1556inline _Tpvec v_load_aligned(const _Tp* ptr) \
1557{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
1558inline _Tpvec v_load_low(const _Tp* ptr) \
1559{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
1560inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1561{ \
1562 return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1563 _mm_loadl_epi64((const __m128i*)ptr1))); \
1564} \
1565inline void v_store(_Tp* ptr, const _Tpvec& a) \
1566{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
1567inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1568{ _mm_store_si128((__m128i*)ptr, a.val); } \
1569inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1570{ _mm_stream_si128((__m128i*)ptr, a.val); } \
1571inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1572{ \
1573 if( mode == hal::STORE_UNALIGNED ) \
1574 _mm_storeu_si128((__m128i*)ptr, a.val); \
1575 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
1576 _mm_stream_si128((__m128i*)ptr, a.val); \
1577 else \
1578 _mm_store_si128((__m128i*)ptr, a.val); \
1579} \
1580inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1581{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
1582inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1583{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
1584
1585OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
1586OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
1587OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
1588OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
1589OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1590OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
1591OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
1592OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
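
// --- Editorial usage sketch (not part of the original header) ----------------
// A minimal illustration of the integer load/store wrappers above combined with
// the lane-rotation templates defined earlier; pointer names are hypothetical.
inline void v_example_load_rotate_store(const int* src, int* dst)
{
    v_int32x4 a = v_load(src);            // lanes: src[0] src[1] src[2] src[3]
    v_int32x4 r = v_rotate_right<1>(a);   // lanes: src[1] src[2] src[3] 0
    v_store(dst, r);                      // unaligned store of the rotated lanes
}
// -----------------------------------------------------------------------------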
1593
1594#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
1595inline _Tpvec v_load(const _Tp* ptr) \
1596{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
1597inline _Tpvec v_load_aligned(const _Tp* ptr) \
1598{ return _Tpvec(_mm_load_##suffix(ptr)); } \
1599inline _Tpvec v_load_low(const _Tp* ptr) \
1600{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
1601inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1602{ \
1603 return _Tpvec(_mm_castsi128_##suffix( \
1604 _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1605 _mm_loadl_epi64((const __m128i*)ptr1)))); \
1606} \
1607inline void v_store(_Tp* ptr, const _Tpvec& a) \
1608{ _mm_storeu_##suffix(ptr, a.val); } \
1609inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1610{ _mm_store_##suffix(ptr, a.val); } \
1611inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1612{ _mm_stream_##suffix(ptr, a.val); } \
1613inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1614{ \
1615 if( mode == hal::STORE_UNALIGNED ) \
1616 _mm_storeu_##suffix(ptr, a.val); \
1617 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
1618 _mm_stream_##suffix(ptr, a.val); \
1619 else \
1620 _mm_store_##suffix(ptr, a.val); \
1621} \
1622inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1623{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
1624inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1625{ \
1626 __m128i a1 = _mm_cast##suffix##_si128(a.val); \
1627 _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
1628}
1629
1630OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
1631OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
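
// --- Editorial usage sketch (not part of the original header) ----------------
// Shows the float variants: v_load_halves() stitches two 2-lane halves into one
// register, and v_store_low()/v_store_high() write the halves back separately.
// Pointer names are hypothetical.
inline void v_example_float_halves(const float* lo, const float* hi, float* out)
{
    v_float32x4 v = v_load_halves(lo, hi); // lanes: lo[0] lo[1] hi[0] hi[1]
    v_store_low(out, v);                   // writes lo[0] lo[1]
    v_store_high(out + 2, v);              // writes hi[0] hi[1]
}
// -----------------------------------------------------------------------------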
1632
1633inline unsigned v_reduce_sum(const v_uint8x16& a)
1634{
1635 __m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
1636 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1637}
1638inline int v_reduce_sum(const v_int8x16& a)
1639{
1640 __m128i half = _mm_set1_epi8((schar)-128);
1641 half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
1642 return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
1643}
1644#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
1645inline schar v_reduce_##func(const v_int8x16& a) \
1646{ \
1647 __m128i val = a.val; \
1648 __m128i smask = _mm_set1_epi8((schar)-128); \
1649 val = _mm_xor_si128(val, smask); \
1650 val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
1651 val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
1652 val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
1653 val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
1654 return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
1655} \
1656inline uchar v_reduce_##func(const v_uint8x16& a) \
1657{ \
1658 __m128i val = a.val; \
1659 val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
1660 val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
1661 val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
1662 val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
1663 return (uchar)_mm_cvtsi128_si32(val); \
1664}
1665OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
1666OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
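
// --- Editorial usage sketch (not part of the original header) ----------------
// Horizontal min/max over 16 bytes, e.g. to find the intensity range of a
// 16-pixel span. The pointer name is hypothetical.
inline void v_example_byte_range(const uchar* row, uchar& lo, uchar& hi)
{
    v_uint8x16 v = v_load(row);   // 16 consecutive pixels
    lo = v_reduce_min(v);         // smallest lane
    hi = v_reduce_max(v);         // largest lane
}
// -----------------------------------------------------------------------------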
1667
1668#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
1669inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
1670{ \
1671 __m128i val = a.val; \
1672 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1673 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1674 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1675 return (scalartype)_mm_cvtsi128_si32(val); \
1676} \
1677inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
1678{ \
1679 __m128i val = a.val; \
1680 __m128i smask = _mm_set1_epi16(sbit); \
1681 val = _mm_xor_si128(val, smask); \
1682 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1683 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1684 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1685 return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
1686}
1687OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
1688OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
1689
1690#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
1691inline scalartype v_reduce_sum(const _Tpvec& a) \
1692{ \
1693 regtype val = a.val; \
1694 val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
1695 val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
1696 return (scalartype)_mm_cvt##extract(val); \
1697}
1698
1699#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
1700inline scalartype v_reduce_##func(const _Tpvec& a) \
1701{ \
1702 scalartype CV_DECL_ALIGNED(16) buf[4]; \
1703 v_store_aligned(buf, a); \
1704 scalartype s0 = scalar_func(buf[0], buf[1]); \
1705 scalartype s1 = scalar_func(buf[2], buf[3]); \
1706 return scalar_func(s0, s1); \
1707}
1708
1709OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1710OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1711OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
1712
1713inline int v_reduce_sum(const v_int16x8& a)
1714{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1715inline unsigned v_reduce_sum(const v_uint16x8& a)
1716{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1717
1718inline uint64 v_reduce_sum(const v_uint64x2& a)
1719{
1720 uint64 CV_DECL_ALIGNED(32) idx[2];
1721 v_store_aligned(idx, a);
1722 return idx[0] + idx[1];
1723}
1724inline int64 v_reduce_sum(const v_int64x2& a)
1725{
1726 int64 CV_DECL_ALIGNED(32) idx[2];
1727 v_store_aligned(idx, a);
1728 return idx[0] + idx[1];
1729}
1730inline double v_reduce_sum(const v_float64x2& a)
1731{
1732 double CV_DECL_ALIGNED(32) idx[2];
1733 v_store_aligned(idx, a);
1734 return idx[0] + idx[1];
1735}
1736
1737inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1738 const v_float32x4& c, const v_float32x4& d)
1739{
1740#if CV_SSE3
1741 __m128 ab = _mm_hadd_ps(a.val, b.val);
1742 __m128 cd = _mm_hadd_ps(c.val, d.val);
1743 return v_float32x4(_mm_hadd_ps(ab, cd));
1744#else
1745 __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
1746 __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
1747 return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
1748#endif
1749}
1750
1751OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
1752OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
1753OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
1754OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
1755OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
1756OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
1757
1758inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1759{
1760 __m128i half = _mm_sad_epu8(a.val, b.val);
1761 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1762}
1763inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1764{
1765 __m128i half = _mm_set1_epi8(0x7f);
1766 half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half));
1767 return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
1768}
1769inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1770{
1771 v_uint32x4 l, h;
1772 v_expand(v_absdiff(a, b), l, h);
1773 return v_reduce_sum(l + h);
1774}
1775inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1776{
1777 v_uint32x4 l, h;
1778 v_expand(v_absdiff(a, b), l, h);
1779 return v_reduce_sum(l + h);
1780}
1781inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1782{
1783 return v_reduce_sum(v_absdiff(a, b));
1784}
1785inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1786{
1787 return v_reduce_sum(v_absdiff(a, b));
1788}
1789inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1790{
1791 return v_reduce_sum(v_absdiff(a, b));
1792}
1793
1794inline v_uint8x16 v_popcount(const v_uint8x16& a)
1795{
1796 __m128i m1 = _mm_set1_epi32(0x55555555);
1797 __m128i m2 = _mm_set1_epi32(0x33333333);
1798 __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
1799 __m128i p = a.val;
1800 p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
1801 p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
1802 p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
1803 return v_uint8x16(p);
1804}
1805inline v_uint16x8 v_popcount(const v_uint16x8& a)
1806{
1807 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1808 p += v_rotate_right<1>(p);
1809 return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
1810}
1811inline v_uint32x4 v_popcount(const v_uint32x4& a)
1812{
1813 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1814 p += v_rotate_right<1>(p);
1815 p += v_rotate_right<2>(p);
1816 return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
1817}
1818inline v_uint64x2 v_popcount(const v_uint64x2& a)
1819{
1820 return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
1821}
1822inline v_uint8x16 v_popcount(const v_int8x16& a)
1823{ return v_popcount(v_reinterpret_as_u8(a)); }
1824inline v_uint16x8 v_popcount(const v_int16x8& a)
1825{ return v_popcount(v_reinterpret_as_u16(a)); }
1826inline v_uint32x4 v_popcount(const v_int32x4& a)
1827{ return v_popcount(v_reinterpret_as_u32(a)); }
1828inline v_uint64x2 v_popcount(const v_int64x2& a)
1829{ return v_popcount(v_reinterpret_as_u64(a)); }
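
// --- Editorial usage sketch (not part of the original header) ----------------
// Per-lane popcount plus a horizontal sum gives the total number of set bits in
// 128 bits of data. The pointer name is hypothetical.
inline unsigned v_example_popcount128(const unsigned* p)
{
    v_uint32x4 bits = v_popcount(v_load(p)); // set-bit count per 32-bit lane
    return v_reduce_sum(bits);               // total over the four lanes
}
// -----------------------------------------------------------------------------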
1830
1831#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \
1832inline int v_signmask(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)); } \
1833inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \
1834inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }
1835OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535)
1836OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
1837OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15)
1838OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
1839OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3)
1840OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
1841OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
1842OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)
1843
1844#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \
1845inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \
1846inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \
1847inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }
1848OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
1849OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
1850
1851inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1852inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1853inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1854inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1855inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1856inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1857inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1858inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1859inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1860inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
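
// --- Editorial usage sketch (not part of the original header) ----------------
// v_check_any() reports whether any lane has its sign (mask) bit set and
// v_scan_forward() returns the index of the first such lane. These helpers are
// normally applied to comparison masks; for byte lanes the sign bit is the top
// bit of each byte, so raw data works too. The pointer name is hypothetical.
inline int v_example_first_high_byte(const uchar* p)
{
    v_uint8x16 v = v_load(p);
    if (!v_check_any(v))       // no byte has its top bit set
        return -1;
    return v_scan_forward(v);  // index of the first byte >= 128
}
// -----------------------------------------------------------------------------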
1861
1862#if CV_SSE4_1
1863#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
1864inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1865{ \
1866 return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
1867}
1868
1869OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1870OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1871OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1872OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1873OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1874OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1875// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
1876// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
1877OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
1878OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
1879
1880#else // CV_SSE4_1
1881
1882#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
1883inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1884{ \
1885 return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
1886}
1887
1888OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
1889OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
1890OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
1891OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
1892OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
1893OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
1894// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
1895// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
1896OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
1897OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
1898#endif
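
// --- Editorial usage sketch (not part of the original header) ----------------
// v_select(mask, a, b) keeps a-lanes where the mask lane is all ones and b-lanes
// where it is all zeros, on both the SSE4.1 and the pre-SSE4.1 paths above. Here
// the mask is simply loaded from memory (0x00 or 0xFF per byte); in real code it
// usually comes from a comparison. Names are hypothetical.
inline v_uint8x16 v_example_blend(const uchar* mask0or255,
                                  const uchar* a, const uchar* b)
{
    v_uint8x16 mask = v_load(mask0or255);        // 0x00 or 0xFF per lane
    return v_select(mask, v_load(a), v_load(b)); // per-byte blend
}
// -----------------------------------------------------------------------------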
1899
1900/* Expand */
1901#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1902 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1903 { \
1904 b0.val = intrin(a.val); \
1905 b1.val = __CV_CAT(intrin, _high)(a.val); \
1906 } \
1907 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1908 { return _Tpwvec(intrin(a.val)); } \
1909 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1910 { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
1911 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1912 { \
1913 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
1914 return _Tpwvec(intrin(a)); \
1915 }
1916
1917OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, _v128_cvtepu8_epi16)
1918OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16, v_int16x8, schar, _v128_cvtepi8_epi16)
1919OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, _v128_cvtepu16_epi32)
1920OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8, v_int32x4, short, _v128_cvtepi16_epi32)
1921OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64)
1922OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4, v_int64x2, int, _v128_cvtepi32_epi64)
1923
1924#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \
1925 inline _Tpvec v_load_expand_q(const _Tp* ptr) \
1926 { \
1927 typedef int CV_DECL_ALIGNED(1) unaligned_int; \
1928 __m128i a = _mm_cvtsi32_si128(*(const unaligned_int*)ptr); \
1929 return _Tpvec(intrin(a)); \
1930 }
1931
1932OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
1933OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4, schar, _v128_cvtepi8_epi32)
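
// --- Editorial usage sketch (not part of the original header) ----------------
// Widening: v_load_expand() reads 8 bytes and widens them to 16-bit lanes,
// v_expand() splits an already-loaded register into two wider halves, and
// v_load_expand_q() widens 4 bytes straight to 32-bit lanes. The pointer name
// is hypothetical.
inline unsigned v_example_sum_bytes(const uchar* p)
{
    v_uint16x8 w = v_load_expand(p);   // p[0..7] widened to 16 bits
    v_uint32x4 lo, hi;
    v_expand(w, lo, hi);               // widen again to 32 bits
    return v_reduce_sum(lo + hi);      // p[0] + ... + p[7]
}
// -----------------------------------------------------------------------------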
1934
1935#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
1936inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1937{ \
1938 b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
1939 b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
1940} \
1941inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1942{ \
1943 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1944 return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
1945} \
1946inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1947{ \
1948 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1949 return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
1950} \
1951inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1952{ \
1953 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1954 c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
1955 d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
1956}
1957
1958OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1959OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1960OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1961OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1962OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1963OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1964OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
1965OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
1966
1967inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1968{
1969#if CV_SSSE3
1970 static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1971 return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
1972#else
1973 uchar CV_DECL_ALIGNED(32) d[16];
1974 v_store_aligned(d, a);
1975 return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
1976#endif
1977}
1978
1979inline v_int8x16 v_reverse(const v_int8x16 &a)
1980{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1981
1982inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1983{
1984#if CV_SSSE3
1985 static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
1986 return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
1987#else
1988 __m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
1989 r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
1990 r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
1991 return v_uint16x8(r);
1992#endif
1993}
1994
1995inline v_int16x8 v_reverse(const v_int16x8 &a)
1996{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1997
1998inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1999{
2000 return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
2001}
2002
2003inline v_int32x4 v_reverse(const v_int32x4 &a)
2004{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
2005
2006inline v_float32x4 v_reverse(const v_float32x4 &a)
2007{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
2008
2009inline v_uint64x2 v_reverse(const v_uint64x2 &a)
2010{
2011 return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
2012}
2013
2014inline v_int64x2 v_reverse(const v_int64x2 &a)
2015{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
2016
2017inline v_float64x2 v_reverse(const v_float64x2 &a)
2018{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
2019
2020template<int s, typename _Tpvec>
2021inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
2022{
2023 return v_rotate_right<s>(a, b);
2024}
2025
2026inline v_int32x4 v_round(const v_float32x4& a)
2027{ return v_int32x4(_mm_cvtps_epi32(a.val)); }
2028
2029inline v_int32x4 v_floor(const v_float32x4& a)
2030{
2031 __m128i a1 = _mm_cvtps_epi32(a.val);
2032 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
2033 return v_int32x4(_mm_add_epi32(a1, mask));
2034}
2035
2036inline v_int32x4 v_ceil(const v_float32x4& a)
2037{
2038 __m128i a1 = _mm_cvtps_epi32(a.val);
2039 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
2040 return v_int32x4(_mm_sub_epi32(a1, mask));
2041}
2042
2043inline v_int32x4 v_trunc(const v_float32x4& a)
2044{ return v_int32x4(_mm_cvttps_epi32(a.val)); }
2045
2046inline v_int32x4 v_round(const v_float64x2& a)
2047{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }
2048
2049inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2050{
2051 __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val);
2052 return v_int32x4(_mm_unpacklo_epi64(ai, bi));
2053}
2054
2055inline v_int32x4 v_floor(const v_float64x2& a)
2056{
2057 __m128i a1 = _mm_cvtpd_epi32(a.val);
2058 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
2059 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
2060 return v_int32x4(_mm_add_epi32(a1, mask));
2061}
2062
2063inline v_int32x4 v_ceil(const v_float64x2& a)
2064{
2065 __m128i a1 = _mm_cvtpd_epi32(a.val);
2066 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
2067 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
2068 return v_int32x4(_mm_sub_epi32(a1, mask));
2069}
2070
2071inline v_int32x4 v_trunc(const v_float64x2& a)
2072{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }
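
// --- Editorial usage sketch (not part of the original header) ----------------
// The four float-to-int conversions above differ only in the rounding rule.
// With the default round-to-nearest-even mode and x = {-1.5f, -0.5f, 0.5f, 1.5f}:
//   v_round : -2  0  0  2
//   v_floor : -2 -1  0  1
//   v_ceil  : -1  0  1  2
//   v_trunc : -1  0  0  1
// Pointer names are hypothetical; dst is assumed to hold 16 ints.
inline void v_example_round_modes(const float* src, int* dst)
{
    v_float32x4 x = v_load(src);
    v_store(dst,      v_round(x));
    v_store(dst + 4,  v_floor(x));
    v_store(dst + 8,  v_ceil(x));
    v_store(dst + 12, v_trunc(x));
}
// -----------------------------------------------------------------------------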
2073
2074#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
2075inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
2076 const _Tpvec& a2, const _Tpvec& a3, \
2077 _Tpvec& b0, _Tpvec& b1, \
2078 _Tpvec& b2, _Tpvec& b3) \
2079{ \
2080 __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
2081 __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
2082 __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
2083 __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
2084\
2085 b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
2086 b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
2087 b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
2088 b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
2089}
2090
2091OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2092OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2093OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
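
// --- Editorial usage sketch (not part of the original header) ----------------
// Transposing a 4x4 float tile stored row-major, e.g. as a building block of a
// larger matrix transpose. Pointer names are hypothetical.
inline void v_example_transpose4x4(const float* src, float* dst)
{
    v_float32x4 r0 = v_load(src),     r1 = v_load(src + 4),
                r2 = v_load(src + 8), r3 = v_load(src + 12);
    v_float32x4 c0, c1, c2, c3;
    v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3); // columns become rows
    v_store(dst,     c0);  v_store(dst + 4,  c1);
    v_store(dst + 8, c2);  v_store(dst + 12, c3);
}
// -----------------------------------------------------------------------------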
2094
2095// load deinterleave
2096inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
2097{
2098 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2099 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2100
2101 __m128i t10 = _mm_unpacklo_epi8(t00, t01);
2102 __m128i t11 = _mm_unpackhi_epi8(t00, t01);
2103
2104 __m128i t20 = _mm_unpacklo_epi8(t10, t11);
2105 __m128i t21 = _mm_unpackhi_epi8(t10, t11);
2106
2107 __m128i t30 = _mm_unpacklo_epi8(t20, t21);
2108 __m128i t31 = _mm_unpackhi_epi8(t20, t21);
2109
2110 a.val = _mm_unpacklo_epi8(t30, t31);
2111 b.val = _mm_unpackhi_epi8(t30, t31);
2112}
2113
2114inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
2115{
2116#if CV_SSE4_1
2117 const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2118 const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2119 __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
2120 __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2121 __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2122 __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
2123 __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
2124 __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
2125 const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
2126 const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
2127 const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
2128 a0 = _mm_shuffle_epi8(a0, sh_b);
2129 b0 = _mm_shuffle_epi8(b0, sh_g);
2130 c0 = _mm_shuffle_epi8(c0, sh_r);
2131 a.val = a0;
2132 b.val = b0;
2133 c.val = c0;
2134#elif CV_SSSE3
2135 const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
2136 const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
2137 const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
2138
2139 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2140 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2141 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2142
2143 __m128i s0 = _mm_shuffle_epi8(t0, m0);
2144 __m128i s1 = _mm_shuffle_epi8(t1, m1);
2145 __m128i s2 = _mm_shuffle_epi8(t2, m2);
2146
2147 t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
2148 a.val = _mm_alignr_epi8(s2, t0, 5);
2149
2150 t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
2151 b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
2152
2153 t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
2154 c.val = _mm_alignr_epi8(t2, s0, 11);
2155#else
2156 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2157 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2158 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
2159
2160 __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
2161 __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
2162 __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
2163
2164 __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
2165 __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
2166 __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
2167
2168 __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
2169 __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
2170 __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
2171
2172 a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
2173 b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
2174 c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
2175#endif
2176}
2177
2178inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2179{
2180 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2181 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2182 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
2183 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
2184
2185 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
2186 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
2187 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
2188 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
2189
2190 u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
2191 u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
2192 u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
2193 u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
2194
2195 v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
2196 v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
2197 v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
2198 v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
2199
2200 a.val = _mm_unpacklo_epi8(v0, v1);
2201 b.val = _mm_unpackhi_epi8(v0, v1);
2202 c.val = _mm_unpacklo_epi8(v2, v3);
2203 d.val = _mm_unpackhi_epi8(v2, v3);
2204}
2205
2206inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2207{
2208 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
2209 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
2210
2211 __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
2212 __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
2213 __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
2214 __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
2215
2216 a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
2217 b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
2218}
2219
2220inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2221{
2222#if CV_SSE4_1
2223 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
2224 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
2225 __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
2226 __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
2227 __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
2228 __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
2229
2230 const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2231 const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
2232 const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2233 a0 = _mm_shuffle_epi8(a0, sh_a);
2234 b0 = _mm_shuffle_epi8(b0, sh_b);
2235 c0 = _mm_shuffle_epi8(c0, sh_c);
2236
2237 a.val = a0;
2238 b.val = b0;
2239 c.val = c0;
2240#else
2241 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2242 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2243 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
2244
2245 __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
2246 __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
2247 __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
2248
2249 __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
2250 __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
2251 __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
2252
2253 a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
2254 b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
2255 c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
2256#endif
2257}
2258
2259inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2260{
2261 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2262 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
2263 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
2264 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
2265
2266 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
2267 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
2268 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
2269 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
2270
2271 u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
2272 u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
2273 u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
2274 u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
2275
2276 a.val = _mm_unpacklo_epi16(u0, u1);
2277 b.val = _mm_unpackhi_epi16(u0, u1);
2278 c.val = _mm_unpacklo_epi16(u2, u3);
2279 d.val = _mm_unpackhi_epi16(u2, u3);
2280}
2281
2282inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2283{
2284 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1
2285 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
2286
2287 __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
2288 __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
2289
2290 a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
2291 b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
2292}
2293
2294inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2295{
2296 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
2297 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
2298 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
2299
2300 __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
2301 __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
2302 __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
2303
2304 a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
2305 b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
2306 c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
2307}
2308
2309inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2310{
2311 v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
2312 v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
2313 v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
2314 v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
2315
2316 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2317}
2318
2319inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2320{
2321 __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
2322 __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
2323
2324 a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
2325 b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 b2 b3
2326}
2327
2328inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2329{
2330 __m128 t0 = _mm_loadu_ps(ptr + 0);
2331 __m128 t1 = _mm_loadu_ps(ptr + 4);
2332 __m128 t2 = _mm_loadu_ps(ptr + 8);
2333
2334 __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
2335 a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));
2336
2337 __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
2338 __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
2339 b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));
2340
2341 __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
2342 c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
2343}
2344
2345inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2346{
2347 __m128 t0 = _mm_loadu_ps(ptr + 0);
2348 __m128 t1 = _mm_loadu_ps(ptr + 4);
2349 __m128 t2 = _mm_loadu_ps(ptr + 8);
2350 __m128 t3 = _mm_loadu_ps(ptr + 12);
2351 __m128 t02lo = _mm_unpacklo_ps(t0, t2);
2352 __m128 t13lo = _mm_unpacklo_ps(t1, t3);
2353 __m128 t02hi = _mm_unpackhi_ps(t0, t2);
2354 __m128 t13hi = _mm_unpackhi_ps(t1, t3);
2355 a.val = _mm_unpacklo_ps(t02lo, t13lo);
2356 b.val = _mm_unpackhi_ps(t02lo, t13lo);
2357 c.val = _mm_unpacklo_ps(t02hi, t13hi);
2358 d.val = _mm_unpackhi_ps(t02hi, t13hi);
2359}
2360
2361inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2362{
2363 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
2364 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
2365
2366 a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2367 b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
2368}
2369
2370inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2371{
2372 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
2373 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
2374 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
2375
2376 t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
2377
2378 a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
2379 b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
2380 c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
2381}
2382
2383inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2384 v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2385{
2386 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
2387 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
2388 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
2389 __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
2390
2391 a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
2392 b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
2393 c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
2394 d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
2395}
2396
2397// store interleave
2398
2399inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2400 hal::StoreMode mode = hal::STORE_UNALIGNED)
2401{
2402 __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
2403 __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
2404
2405 if( mode == hal::STORE_ALIGNED_NOCACHE )
2406 {
2407 _mm_stream_si128((__m128i*)(ptr), v0);
2408 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2409 }
2410 else if( mode == hal::STORE_ALIGNED )
2411 {
2412 _mm_store_si128((__m128i*)(ptr), v0);
2413 _mm_store_si128((__m128i*)(ptr + 16), v1);
2414 }
2415 else
2416 {
2417 _mm_storeu_si128((__m128i*)(ptr), v0);
2418 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2419 }
2420}
2421
2422inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2423 const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2424{
2425#if CV_SSE4_1
2426 const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2427 const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2428 const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2429 __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2430 __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2431 __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2432
2433 const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2434 const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2435 __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
2436 __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
2437 __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
2438#elif CV_SSSE3
2439 const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
2440 const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
2441 const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
2442
2443 __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
2444 t0 = _mm_alignr_epi8(c.val, t0, 5);
2445 __m128i v0 = _mm_shuffle_epi8(t0, m0);
2446
2447 __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
2448 t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
2449 __m128i v1 = _mm_shuffle_epi8(t1, m1);
2450
2451 __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
2452 t2 = _mm_alignr_epi8(t2, a.val, 11);
2453 __m128i v2 = _mm_shuffle_epi8(t2, m2);
2454#else
2455 __m128i z = _mm_setzero_si128();
2456 __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
2457 __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
2458 __m128i c0 = _mm_unpacklo_epi8(c.val, z);
2459 __m128i c1 = _mm_unpackhi_epi8(c.val, z);
2460
2461 __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
2462 __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
2463 __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
2464 __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
2465
2466 __m128i p10 = _mm_unpacklo_epi32(p00, p01);
2467 __m128i p11 = _mm_unpackhi_epi32(p00, p01);
2468 __m128i p12 = _mm_unpacklo_epi32(p02, p03);
2469 __m128i p13 = _mm_unpackhi_epi32(p02, p03);
2470
2471 __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2472 __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2473 __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2474 __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2475
2476 p20 = _mm_slli_si128(p20, 1);
2477 p22 = _mm_slli_si128(p22, 1);
2478
2479 __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
2480 __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
2481 __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
2482 __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
2483
2484 __m128i p40 = _mm_unpacklo_epi64(p30, p31);
2485 __m128i p41 = _mm_unpackhi_epi64(p30, p31);
2486 __m128i p42 = _mm_unpacklo_epi64(p32, p33);
2487 __m128i p43 = _mm_unpackhi_epi64(p32, p33);
2488
2489 __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
2490 __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
2491 __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
2492#endif
2493
2494 if( mode == hal::STORE_ALIGNED_NOCACHE )
2495 {
2496 _mm_stream_si128((__m128i*)(ptr), v0);
2497 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2498 _mm_stream_si128((__m128i*)(ptr + 32), v2);
2499 }
2500 else if( mode == hal::STORE_ALIGNED )
2501 {
2502 _mm_store_si128((__m128i*)(ptr), v0);
2503 _mm_store_si128((__m128i*)(ptr + 16), v1);
2504 _mm_store_si128((__m128i*)(ptr + 32), v2);
2505 }
2506 else
2507 {
2508 _mm_storeu_si128((__m128i*)(ptr), v0);
2509 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2510 _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2511 }
2512}
2513
2514inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2515 const v_uint8x16& c, const v_uint8x16& d,
2516 hal::StoreMode mode = hal::STORE_UNALIGNED)
2517{
2518 // a0 a1 a2 a3 ....
2519 // b0 b1 b2 b3 ....
2520 // c0 c1 c2 c3 ....
2521 // d0 d1 d2 d3 ....
2522 __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
2523 __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
2524 __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
2525 __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
2526
2527 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
2528 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
2529 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
2530 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
2531
2532 if( mode == hal::STORE_ALIGNED_NOCACHE )
2533 {
2534 _mm_stream_si128((__m128i*)(ptr), v0);
2535 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2536 _mm_stream_si128((__m128i*)(ptr + 32), v2);
2537 _mm_stream_si128((__m128i*)(ptr + 48), v3);
2538 }
2539 else if( mode == hal::STORE_ALIGNED )
2540 {
2541 _mm_store_si128((__m128i*)(ptr), v0);
2542 _mm_store_si128((__m128i*)(ptr + 16), v1);
2543 _mm_store_si128((__m128i*)(ptr + 32), v2);
2544 _mm_store_si128((__m128i*)(ptr + 48), v3);
2545 }
2546 else
2547 {
2548 _mm_storeu_si128((__m128i*)(ptr), v0);
2549 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2550 _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2551 _mm_storeu_si128((__m128i*)(ptr + 48), v3);
2552 }
2553}
2554
2555inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2556 hal::StoreMode mode = hal::STORE_UNALIGNED)
2557{
2558 __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
2559 __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
2560
2561 if( mode == hal::STORE_ALIGNED_NOCACHE )
2562 {
2563 _mm_stream_si128((__m128i*)(ptr), v0);
2564 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2565 }
2566 else if( mode == hal::STORE_ALIGNED )
2567 {
2568 _mm_store_si128((__m128i*)(ptr), v0);
2569 _mm_store_si128((__m128i*)(ptr + 8), v1);
2570 }
2571 else
2572 {
2573 _mm_storeu_si128((__m128i*)(ptr), v0);
2574 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2575 }
2576}
2577
2578inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2579 const v_uint16x8& b, const v_uint16x8& c,
2580 hal::StoreMode mode = hal::STORE_UNALIGNED)
2581{
2582#if CV_SSE4_1
2583 const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2584 const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2585 const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2586 __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2587 __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2588 __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2589
2590 __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
2591 __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
2592 __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
2593#else
2594 __m128i z = _mm_setzero_si128();
2595 __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
2596 __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
2597 __m128i c0 = _mm_unpacklo_epi16(c.val, z);
2598 __m128i c1 = _mm_unpackhi_epi16(c.val, z);
2599
2600 __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
2601 __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
2602 __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
2603 __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
2604
2605 __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2606 __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2607 __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2608 __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2609
2610 p20 = _mm_slli_si128(p20, 2);
2611 p22 = _mm_slli_si128(p22, 2);
2612
2613 __m128i p30 = _mm_unpacklo_epi64(p20, p21);
2614 __m128i p31 = _mm_unpackhi_epi64(p20, p21);
2615 __m128i p32 = _mm_unpacklo_epi64(p22, p23);
2616 __m128i p33 = _mm_unpackhi_epi64(p22, p23);
2617
2618 __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
2619 __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
2620 __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
2621#endif
2622 if( mode == hal::STORE_ALIGNED_NOCACHE )
2623 {
2624 _mm_stream_si128((__m128i*)(ptr), v0);
2625 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2626 _mm_stream_si128((__m128i*)(ptr + 16), v2);
2627 }
2628 else if( mode == hal::STORE_ALIGNED )
2629 {
2630 _mm_store_si128((__m128i*)(ptr), v0);
2631 _mm_store_si128((__m128i*)(ptr + 8), v1);
2632 _mm_store_si128((__m128i*)(ptr + 16), v2);
2633 }
2634 else
2635 {
2636 _mm_storeu_si128((__m128i*)(ptr), v0);
2637 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2638 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2639 }
2640}
2641
2642inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2643 const v_uint16x8& c, const v_uint16x8& d,
2644 hal::StoreMode mode = hal::STORE_UNALIGNED)
2645{
2646 // a0 a1 a2 a3 ....
2647 // b0 b1 b2 b3 ....
2648 // c0 c1 c2 c3 ....
2649 // d0 d1 d2 d3 ....
2650 __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
2651 __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
2652 __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
2653 __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
2654
2655 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
2656 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
2657 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
2658 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
2659
2660 if( mode == hal::STORE_ALIGNED_NOCACHE )
2661 {
2662 _mm_stream_si128((__m128i*)(ptr), v0);
2663 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2664 _mm_stream_si128((__m128i*)(ptr + 16), v2);
2665 _mm_stream_si128((__m128i*)(ptr + 24), v3);
2666 }
2667 else if( mode == hal::STORE_ALIGNED )
2668 {
2669 _mm_store_si128((__m128i*)(ptr), v0);
2670 _mm_store_si128((__m128i*)(ptr + 8), v1);
2671 _mm_store_si128((__m128i*)(ptr + 16), v2);
2672 _mm_store_si128((__m128i*)(ptr + 24), v3);
2673 }
2674 else
2675 {
2676 _mm_storeu_si128((__m128i*)(ptr), v0);
2677 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2678 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2679 _mm_storeu_si128((__m128i*)(ptr + 24), v3);
2680 }
2681}
2682
2683inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2684 hal::StoreMode mode = hal::STORE_UNALIGNED)
2685{
2686 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
2687 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
2688
2689 if( mode == hal::STORE_ALIGNED_NOCACHE )
2690 {
2691 _mm_stream_si128((__m128i*)(ptr), v0);
2692 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2693 }
2694 else if( mode == hal::STORE_ALIGNED )
2695 {
2696 _mm_store_si128((__m128i*)(ptr), v0);
2697 _mm_store_si128((__m128i*)(ptr + 4), v1);
2698 }
2699 else
2700 {
2701 _mm_storeu_si128((__m128i*)(ptr), v0);
2702 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2703 }
2704}
2705
2706inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2707 const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2708{
2709 v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
2710 v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
2711
2712 __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
2713 __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
2714 __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
2715
2716 if( mode == hal::STORE_ALIGNED_NOCACHE )
2717 {
2718 _mm_stream_si128((__m128i*)(ptr), v0);
2719 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2720 _mm_stream_si128((__m128i*)(ptr + 8), v2);
2721 }
2722 else if( mode == hal::STORE_ALIGNED )
2723 {
2724 _mm_store_si128((__m128i*)(ptr), v0);
2725 _mm_store_si128((__m128i*)(ptr + 4), v1);
2726 _mm_store_si128((__m128i*)(ptr + 8), v2);
2727 }
2728 else
2729 {
2730 _mm_storeu_si128((__m128i*)(ptr), v0);
2731 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2732 _mm_storeu_si128((__m128i*)(ptr + 8), v2);
2733 }
2734}
2735
2736inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2737 const v_uint32x4& c, const v_uint32x4& d,
2738 hal::StoreMode mode = hal::STORE_UNALIGNED)
2739{
2740 v_uint32x4 v0, v1, v2, v3;
2741 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2742
2743 if( mode == hal::STORE_ALIGNED_NOCACHE )
2744 {
2745 _mm_stream_si128((__m128i*)(ptr), v0.val);
2746 _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
2747 _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
2748 _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
2749 }
2750 else if( mode == hal::STORE_ALIGNED )
2751 {
2752 _mm_store_si128((__m128i*)(ptr), v0.val);
2753 _mm_store_si128((__m128i*)(ptr + 4), v1.val);
2754 _mm_store_si128((__m128i*)(ptr + 8), v2.val);
2755 _mm_store_si128((__m128i*)(ptr + 12), v3.val);
2756 }
2757 else
2758 {
2759 _mm_storeu_si128((__m128i*)(ptr), v0.val);
2760 _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
2761 _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
2762 _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
2763 }
2764}
2765
2766// 2-channel, float only
2767inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2768 hal::StoreMode mode = hal::STORE_UNALIGNED)
2769{
2770 __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
2771 __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
2772
2773 if( mode == hal::STORE_ALIGNED_NOCACHE )
2774 {
2775 _mm_stream_ps(ptr, v0);
2776 _mm_stream_ps(ptr + 4, v1);
2777 }
2778 else if( mode == hal::STORE_ALIGNED )
2779 {
2780 _mm_store_ps(ptr, v0);
2781 _mm_store_ps(ptr + 4, v1);
2782 }
2783 else
2784 {
2785 _mm_storeu_ps(ptr, v0);
2786 _mm_storeu_ps(ptr + 4, v1);
2787 }
2788}
2789
2790inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2791 const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2792{
2793 __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
2794 __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
2795 __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
2796 __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
2797 __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
2798 __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
2799 __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
2800 __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
2801 __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
2802
2803 if( mode == hal::STORE_ALIGNED_NOCACHE )
2804 {
2805 _mm_stream_ps(ptr, v0);
2806 _mm_stream_ps(ptr + 4, v1);
2807 _mm_stream_ps(ptr + 8, v2);
2808 }
2809 else if( mode == hal::STORE_ALIGNED )
2810 {
2811 _mm_store_ps(ptr, v0);
2812 _mm_store_ps(ptr + 4, v1);
2813 _mm_store_ps(ptr + 8, v2);
2814 }
2815 else
2816 {
2817 _mm_storeu_ps(ptr, v0);
2818 _mm_storeu_ps(ptr + 4, v1);
2819 _mm_storeu_ps(ptr + 8, v2);
2820 }
2821}
2822
2823inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2824 const v_float32x4& c, const v_float32x4& d,
2825 hal::StoreMode mode = hal::STORE_UNALIGNED)
2826{
2827 __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
2828 __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
2829 __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
2830 __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
2831 __m128 v0 = _mm_unpacklo_ps(u0, u1);
2832 __m128 v2 = _mm_unpacklo_ps(u2, u3);
2833 __m128 v1 = _mm_unpackhi_ps(u0, u1);
2834 __m128 v3 = _mm_unpackhi_ps(u2, u3);
2835
2836 if( mode == hal::STORE_ALIGNED_NOCACHE )
2837 {
2838 _mm_stream_ps(ptr, v0);
2839 _mm_stream_ps(ptr + 4, v1);
2840 _mm_stream_ps(ptr + 8, v2);
2841 _mm_stream_ps(ptr + 12, v3);
2842 }
2843 else if( mode == hal::STORE_ALIGNED )
2844 {
2845 _mm_store_ps(ptr, v0);
2846 _mm_store_ps(ptr + 4, v1);
2847 _mm_store_ps(ptr + 8, v2);
2848 _mm_store_ps(ptr + 12, v3);
2849 }
2850 else
2851 {
2852 _mm_storeu_ps(ptr, v0);
2853 _mm_storeu_ps(ptr + 4, v1);
2854 _mm_storeu_ps(ptr + 8, v2);
2855 _mm_storeu_ps(ptr + 12, v3);
2856 }
2857}
2858
2859inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2860 hal::StoreMode mode = hal::STORE_UNALIGNED)
2861{
2862 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2863 __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
2864
2865 if( mode == hal::STORE_ALIGNED_NOCACHE )
2866 {
2867 _mm_stream_si128((__m128i*)(ptr), v0);
2868 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2869 }
2870 else if( mode == hal::STORE_ALIGNED )
2871 {
2872 _mm_store_si128((__m128i*)(ptr), v0);
2873 _mm_store_si128((__m128i*)(ptr + 2), v1);
2874 }
2875 else
2876 {
2877 _mm_storeu_si128((__m128i*)(ptr), v0);
2878 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2879 }
2880}
2881
2882inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2883 const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2884{
2885 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2886 __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
2887 __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
2888
2889 if( mode == hal::STORE_ALIGNED_NOCACHE )
2890 {
2891 _mm_stream_si128((__m128i*)(ptr), v0);
2892 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2893 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2894 }
2895 else if( mode == hal::STORE_ALIGNED )
2896 {
2897 _mm_store_si128((__m128i*)(ptr), v0);
2898 _mm_store_si128((__m128i*)(ptr + 2), v1);
2899 _mm_store_si128((__m128i*)(ptr + 4), v2);
2900 }
2901 else
2902 {
2903 _mm_storeu_si128((__m128i*)(ptr), v0);
2904 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2905 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2906 }
2907}
2908
2909inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2910 const v_uint64x2& c, const v_uint64x2& d,
2911 hal::StoreMode mode = hal::STORE_UNALIGNED)
2912{
2913 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2914 __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
2915 __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
2916 __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
2917
2918 if( mode == hal::STORE_ALIGNED_NOCACHE )
2919 {
2920 _mm_stream_si128((__m128i*)(ptr), v0);
2921 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2922 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2923 _mm_stream_si128((__m128i*)(ptr + 6), v3);
2924 }
2925 else if( mode == hal::STORE_ALIGNED )
2926 {
2927 _mm_store_si128((__m128i*)(ptr), v0);
2928 _mm_store_si128((__m128i*)(ptr + 2), v1);
2929 _mm_store_si128((__m128i*)(ptr + 4), v2);
2930 _mm_store_si128((__m128i*)(ptr + 6), v3);
2931 }
2932 else
2933 {
2934 _mm_storeu_si128((__m128i*)(ptr), v0);
2935 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2936 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2937 _mm_storeu_si128((__m128i*)(ptr + 6), v3);
2938 }
2939}
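// Usage sketch for the interleaving stores above: the StoreMode argument selects
// unaligned, aligned, or non-temporal (streaming) stores; the streaming path uses
// _mm_stream_si128 and therefore needs a 16-byte aligned destination. The helper
// below is only an illustration, not an OpenCV API.
static inline void example_store_u64_pairs_streaming(uint64* dst_16B_aligned,
                                                     const v_uint64x2& a, const v_uint64x2& b)
{
    // memory layout after the call: a0 b0 a1 b1
    v_store_interleave(dst_16B_aligned, a, b, hal::STORE_ALIGNED_NOCACHE);
}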
2940
2941#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2942inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2943{ \
2944 _Tpvec1 a1, b1; \
2945 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2946 a0 = v_reinterpret_as_##suffix0(a1); \
2947 b0 = v_reinterpret_as_##suffix0(b1); \
2948} \
2949inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2950{ \
2951 _Tpvec1 a1, b1, c1; \
2952 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2953 a0 = v_reinterpret_as_##suffix0(a1); \
2954 b0 = v_reinterpret_as_##suffix0(b1); \
2955 c0 = v_reinterpret_as_##suffix0(c1); \
2956} \
2957inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2958{ \
2959 _Tpvec1 a1, b1, c1, d1; \
2960 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2961 a0 = v_reinterpret_as_##suffix0(a1); \
2962 b0 = v_reinterpret_as_##suffix0(b1); \
2963 c0 = v_reinterpret_as_##suffix0(c1); \
2964 d0 = v_reinterpret_as_##suffix0(d1); \
2965} \
2966inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2967 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2968{ \
2969 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2970 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2971 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2972} \
2973inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2974 const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2975{ \
2976 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2977 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2978 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2979 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2980} \
2981inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2982 const _Tpvec0& c0, const _Tpvec0& d0, \
2983 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2984{ \
2985 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2986 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2987 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2988 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2989 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2990}
2991
2992OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2993OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2994OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2995OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2996OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
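// Usage sketch for the interleave/deinterleave family defined and instantiated above,
// e.g. splitting packed 3-channel bytes (b0 g0 r0 b1 g1 r1 ...) into planes and back.
// The helper name is only an illustration, not an OpenCV API.
static inline void example_split_merge_bgr16(const uchar* packed, uchar* b, uchar* g, uchar* r,
                                             uchar* repacked)
{
    v_uint8x16 vb, vg, vr;
    v_load_deinterleave(packed, vb, vg, vr);   // gather 16 pixels per channel
    v_store(b, vb); v_store(g, vg); v_store(r, vr);
    v_store_interleave(repacked, vb, vg, vr);  // inverse operation: merge the planes again
}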
2997
2998inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2999{
3000 return v_float32x4(_mm_cvtepi32_ps(a.val));
3001}
3002
3003inline v_float32x4 v_cvt_f32(const v_float64x2& a)
3004{
3005 return v_float32x4(_mm_cvtpd_ps(a.val));
3006}
3007
3008inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
3009{
3010 return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
3011}
3012
3013inline v_float64x2 v_cvt_f64(const v_int32x4& a)
3014{
3015 return v_float64x2(_mm_cvtepi32_pd(a.val));
3016}
3017
3018inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
3019{
3020 return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
3021}
3022
3023inline v_float64x2 v_cvt_f64(const v_float32x4& a)
3024{
3025 return v_float64x2(_mm_cvtps_pd(a.val));
3026}
3027
3028inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
3029{
3030 return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
3031}
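// Usage sketch for the conversions above: v_cvt_f64 widens the lower half of the lanes
// and v_cvt_f64_high widens the upper half. The helper is only an illustration.
static inline void example_int32_to_double(const v_int32x4& a, v_float64x2& lo, v_float64x2& hi)
{
    lo = v_cvt_f64(a);      // {(double)a0, (double)a1}
    hi = v_cvt_f64_high(a); // {(double)a2, (double)a3}
}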
3032
3033// from (Mysticial and wim) https://stackoverflow.com/q/41144668
3034inline v_float64x2 v_cvt_f64(const v_int64x2& v)
3035{
3036 // constants encoded as floating-point
3037 __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
3038 __m128i magic_i_all = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
3039 __m128d magic_d_all = _mm_castsi128_pd(magic_i_all);
3040 // Blend the 32 least significant bits of v with magic_i_lo
3041#if CV_SSE4_1
3042 __m128i magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52
3043 __m128i v_lo = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
3044#else
3045 __m128i magic_i_lo = _mm_set1_epi32(0x43300000); // 2^52
3046 __m128i v_lo = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
3047#endif
3048 // Extract the 32 most significant bits of v
3049 __m128i v_hi = _mm_srli_epi64(v.val, 32);
3050 // Flip the msb of v_hi and blend with 0x45300000
3051 v_hi = _mm_xor_si128(v_hi, magic_i_hi32);
3052 // Compute in double precision
3053 __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
3054 // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
3055 __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
3056 return v_float64x2(result);
3057}
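// Scalar restatement of the magic-number trick used above, given as an illustration only
// (assumes IEEE-754 doubles and that <cstring> is available for std::memcpy):
//   lo = bits(0x43300000'00000000 | low32)           -> the double 2^52 + low32
//   hi = bits(0x45300000'00000000 | (high32 ^ 2^31)) -> 2^84 + 2^63 + (int32)high32 * 2^32
//   (hi - (2^84 + 2^63 + 2^52)) + lo == value, with a single rounding step at the end
static inline double example_cvt_i64_f64_scalar(int64 v)
{
    uint64 u = (uint64)v;
    uint64 lo_bits = 0x4330000000000000ULL | (u & 0xffffffffULL);
    uint64 hi_bits = 0x4530000000000000ULL | ((u >> 32) ^ 0x80000000ULL);
    uint64 magic_bits = 0x4530000080100000ULL; // 2^84 + 2^63 + 2^52
    double lo, hi, magic;
    std::memcpy(&lo, &lo_bits, sizeof(lo));
    std::memcpy(&hi, &hi_bits, sizeof(hi));
    std::memcpy(&magic, &magic_bits, sizeof(magic));
    return (hi - magic) + lo;
}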
3058
3059////////////// Lookup table access ////////////////
3060
3061inline v_int8x16 v_lut(const schar* tab, const int* idx)
3062{
3063#if defined(_MSC_VER)
3064 return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
3065 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
3066#else
3067 return v_int8x16(_mm_setr_epi64(
3068 _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
3069 _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
3070 ));
3071#endif
3072}
3073inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
3074{
3075#if defined(_MSC_VER)
3076 return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
3077 *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
3078#else
3079 return v_int8x16(_mm_setr_epi64(
3080 _mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])),
3081 _mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))
3082 ));
3083#endif
3084}
3085inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
3086{
3087#if defined(_MSC_VER)
3088 return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3089 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3090#else
3091 return v_int8x16(_mm_setr_epi64(
3092 _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3093 _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3094 ));
3095#endif
3096}
3097inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
3098inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
3099inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
3100
3101inline v_int16x8 v_lut(const short* tab, const int* idx)
3102{
3103#if defined(_MSC_VER)
3104 return v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
3105 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
3106#else
3107 return v_int16x8(_mm_setr_epi64(
3108 _mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
3109 _mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
3110 ));
3111#endif
3112}
3113inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
3114{
3115#if defined(_MSC_VER)
3116 return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
3117 *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
3118#else
3119 return v_int16x8(_mm_setr_epi64(
3120 _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
3121 _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
3122 ));
3123#endif
3124}
3125inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
3126{
3127 return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3128}
3129inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
3130inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
3131inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
3132
3133inline v_int32x4 v_lut(const int* tab, const int* idx)
3134{
3135#if defined(_MSC_VER)
3136 return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]],
3137 tab[idx[2]], tab[idx[3]]));
3138#else
3139 return v_int32x4(_mm_setr_epi64(
3140 _mm_setr_pi32(tab[idx[0]], tab[idx[1]]),
3141 _mm_setr_pi32(tab[idx[2]], tab[idx[3]])
3142 ));
3143#endif
3144}
3145inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
3146{
3147 return v_int32x4(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
3148}
3149inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
3150{
3151 return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3152}
3153inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
3154inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
3155inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
3156
3157inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
3158{
3159 return v_int64x2(_mm_set_epi64x(tab[idx[1]], tab[idx[0]]));
3160}
3161inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
3162{
3163 return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0])));
3164}
3165inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
3166inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
3167
3168inline v_float32x4 v_lut(const float* tab, const int* idx)
3169{
3170 return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3171}
3172inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
3173inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
3174
3175inline v_float64x2 v_lut(const double* tab, const int* idx)
3176{
3177 return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
3178}
3179inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
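// Usage sketch for the v_lut* gathers above: v_lut reads one element per index,
// v_lut_pairs reads tab[idx[i]] and tab[idx[i]+1], and v_lut_quads reads four consecutive
// elements per index. The helper and the index values are only an illustration.
static inline v_float32x4 example_gather4(const float* tab)
{
    const int idx[4] = { 7, 0, 3, 12 };
    return v_lut(tab, idx); // lanes: tab[7], tab[0], tab[3], tab[12]
}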
3180
3181inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
3182{
3183 int CV_DECL_ALIGNED(32) idx[4];
3184 v_store_aligned(idx, idxvec);
3185 return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3186}
3187
3188inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
3189{
3190 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
3191}
3192
3193inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
3194{
3195 int CV_DECL_ALIGNED(32) idx[4];
3196 v_store_aligned(idx, idxvec);
3197 return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
3198}
3199
3200inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
3201{
3202 int idx[2];
3203 v_store_low(idx, idxvec);
3204 return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
3205}
3206
3207// Loads pairs from the table and deinterleaves them, e.g. returns:
3208// x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
3209// y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
3210// Note that the indices are element (float) indices, not pair indices.
3211// In theory, this function can be used to implement bilinear interpolation,
3212// when idxvec contains the offsets within the image.
3213inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
3214{
3215 int CV_DECL_ALIGNED(32) idx[4];
3216 v_store_aligned(idx, idxvec);
3217 __m128 z = _mm_setzero_ps();
3218 __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
3219 __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
3220 xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
3221 xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
3222 __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
3223 __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
3224 x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
3225 y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
3226}
3227
3228inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
3229{
3230 int idx[2];
3231 v_store_low(idx, idxvec);
3232 __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
3233 __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
3234 x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
3235 y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
3236}
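// Usage sketch for v_lut_deinterleave above: gather the two horizontally adjacent
// samples for four positions of one image row, as a building block for bilinear
// interpolation. The helper is only an illustration, not an OpenCV API.
static inline void example_gather_adjacent_pairs(const float* row, const v_int32x4& offsets,
                                                 v_float32x4& left, v_float32x4& right)
{
    // left  = { row[o0],   row[o1],   row[o2],   row[o3]   }
    // right = { row[o0+1], row[o1+1], row[o2+1], row[o3+1] }
    v_lut_deinterleave(row, offsets, left, right);
}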
3237
3238inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
3239{
3240#if CV_SSSE3
3241 return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200)));
3242#else
3243 __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3244 a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
3245 a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
3246 return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3247#endif
3248}
3249inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
3250inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
3251{
3252#if CV_SSSE3
3253 return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400)));
3254#else
3255 __m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3256 return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
3257#endif
3258}
3259inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
3260
3261inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
3262{
3263#if CV_SSSE3
3264 return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100)));
3265#else
3266 __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
3267 return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)));
3268#endif
3269}
3270inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
3271inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
3272{
3273#if CV_SSSE3
3274 return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100)));
3275#else
3276 return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val)));
3277#endif
3278}
3279inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
3280
3281inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
3282{
3283 return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
3284}
3285inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3286inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
3287
3288inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
3289{
3290#if CV_SSSE3
3291 return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
3292#else
3293 __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
3294 __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
3295 return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3296#endif
3297}
3298inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
3299
3300inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
3301{
3302#if CV_SSSE3
3303 return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100)));
3304#else
3305 return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2));
3306#endif
3307}
3308inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
3309
3310inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
3311inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
3312inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
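// Editorial summary of the reordering helpers above (result lane indices, per input vector):
//   v_interleave_pairs: {0, 2, 1, 3, 4, 6, 5, 7, ...} - swaps the middle two lanes of each group of four
//   v_interleave_quads: {0, 4, 1, 5, 2, 6, 3, 7, ...} - interleaves the two halves of each group of eight
//   v_pack_triplets:    keeps lanes {0, 1, 2, 4, 5, 6, ...} packed to the front; the remaining tail
//                       lanes are unspecified (identity for the 32-bit types, as defined above)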
3313
3314template<int i>
3315inline uchar v_extract_n(const v_uint8x16& v)
3316{
3317#if CV_SSE4_1
3318 return (uchar)_mm_extract_epi8(v.val, i);
3319#else
3320 return v_rotate_right<i>(v).get0();
3321#endif
3322}
3323
3324template<int i>
3325inline schar v_extract_n(const v_int8x16& v)
3326{
3327 return (schar)v_extract_n<i>(v_reinterpret_as_u8(v));
3328}
3329
3330template<int i>
3331inline ushort v_extract_n(const v_uint16x8& v)
3332{
3333 return (ushort)_mm_extract_epi16(v.val, i);
3334}
3335
3336template<int i>
3337inline short v_extract_n(const v_int16x8& v)
3338{
3339 return (short)v_extract_n<i>(v_reinterpret_as_u16(v));
3340}
3341
3342template<int i>
3343inline uint v_extract_n(const v_uint32x4& v)
3344{
3345#if CV_SSE4_1
3346 return (uint)_mm_extract_epi32(v.val, i);
3347#else
3348 return v_rotate_right<i>(v).get0();
3349#endif
3350}
3351
3352template<int i>
3353inline int v_extract_n(const v_int32x4& v)
3354{
3355 return (int)v_extract_n<i>(v_reinterpret_as_u32(v));
3356}
3357
3358template<int i>
3359inline uint64 v_extract_n(const v_uint64x2& v)
3360{
3361#ifdef CV__SIMD_NATIVE_mm_extract_epi64
3362 return (uint64)_v128_extract_epi64<i>(v.val);
3363#else
3364 return v_rotate_right<i>(v).get0();
3365#endif
3366}
3367
3368template<int i>
3369inline int64 v_extract_n(const v_int64x2& v)
3370{
3371 return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
3372}
3373
3374template<int i>
3375inline float v_extract_n(const v_float32x4& v)
3376{
3377 union { uint iv; float fv; } d;
3378 d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
3379 return d.fv;
3380}
3381
3382template<int i>
3383inline double v_extract_n(const v_float64x2& v)
3384{
3385 union { uint64 iv; double dv; } d;
3386 d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
3387 return d.dv;
3388}
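// Usage sketch for v_extract_n above: the lane index is a compile-time constant.
// The helper is only an illustration, not an OpenCV API.
static inline int example_third_lane(const v_int32x4& v)
{
    return v_extract_n<2>(v); // e.g. returns 30 for v = {10, 20, 30, 40}
}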
3389
3390template<int i>
3391inline v_int32x4 v_broadcast_element(const v_int32x4& v)
3392{
3393 return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3394}
3395
3396template<int i>
3397inline v_uint32x4 v_broadcast_element(const v_uint32x4& v)
3398{
3399 return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
3400}
3401
3402template<int i>
3403inline v_float32x4 v_broadcast_element(const v_float32x4& v)
3404{
3405 return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i)));
3406}
3407
3408////////////// FP16 support ///////////////////////////
3409
3410inline v_float32x4 v_load_expand(const hfloat* ptr)
3411{
3412#if CV_FP16
3413 return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
3414#else
3415 const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
3416 const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
3417 const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
3418 __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
3419 __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
3420 __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
3421 __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
3422
3423 t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
3424 __m128i zmask = _mm_cmpeq_epi32(e, z);
3425 __m128i ft = v_select_si128(zmask, zt, t);
3426 return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
3427#endif
3428}
3429
3430inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
3431{
3432#if CV_FP16
3433 __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
3434 _mm_storel_epi64((__m128i*)ptr, fp16_value);
3435#else
3436 const __m128i signmask = _mm_set1_epi32(0x80000000);
3437 const __m128i rval = _mm_set1_epi32(0x3f000000);
3438
3439 __m128i t = _mm_castps_si128(v.val);
3440 __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
3441 t = _mm_andnot_si128(signmask, t);
3442
3443 __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
3444 __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
3445 __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
3446 __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
3447 __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
3448 tt = _mm_sub_epi32(tt, rval);
3449 __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
3450 __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
3451 nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
3452 t = v_select_si128(tinymask, tt, nt);
3453 t = v_select_si128(finitemask, t, naninf);
3454 t = _mm_or_si128(t, sign);
3455 t = _mm_packs_epi32(t, t);
3456 _mm_storel_epi64((__m128i*)ptr, t);
3457#endif
3458}
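// Usage sketch for the FP16 helpers above: widen four half-precision values to float,
// do the arithmetic in single precision, then round back. The helper and the constants
// are only an illustration, not an OpenCV API.
static inline void example_scale_bias_f16(const hfloat* src, hfloat* dst)
{
    v_float32x4 v = v_load_expand(src);                      // 4 halfs -> 4 floats
    v = v_muladd(v, v_setall_f32(2.0f), v_setall_f32(0.5f)); // v*2 + 0.5 in float precision
    v_pack_store(dst, v);                                    // round back to 4 halfs
}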
3459
3460inline void v_cleanup() {}
3461
3462CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3463
3464//! @endcond
3465
3466}
3467
3468#endif