EstervQrCode 2.0.0
Library for QR code manipulation
intrin_wasm.hpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_HAL_INTRIN_WASM_HPP
#define OPENCV_HAL_INTRIN_WASM_HPP

#include <limits>
#include <cstring>
#include <algorithm>
#include <emscripten/version.h>
#include "opencv2/core/saturate.hpp"

#define CV_SIMD128 1
#define CV_SIMD128_64F 0 // For now, all f64 implementations use the scalar fallback, so this is disabled.
#define CV_SIMD128_FP16 0

namespace cv
{

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046)
// handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673)
#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4
#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4
#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2
#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2
#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4
#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4
#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2
#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2
#endif // COMPATIBILITY: <1.38.46
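
// Note: the guard above packs the Emscripten version into one integer as
// major*1000000 + minor*1000 + tiny, so e.g. 1.38.45 encodes as 1038045,
// which is below 1038046 (= 1.38.46) and therefore picks up the legacy names.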

struct v_uint8x16
{
    typedef uchar lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(v128_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = wasm_v128_load(v);
    }

    uchar get0() const
    {
        return (uchar)wasm_i8x16_extract_lane(val, 0);
    }

    v128_t val;
};
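
// Each v_* type in this file wraps a single 128-bit WASM SIMD register
// (v128_t) and records its lane type and lane count. A minimal usage sketch
// (hypothetical values, not part of this header):
//     v_uint8x16 a(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
//     uchar first = a.get0(); // first == 0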

struct v_int8x16
{
    typedef schar lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(v128_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = wasm_v128_load(v);
    }

    schar get0() const
    {
        return wasm_i8x16_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(v128_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = wasm_v128_load(v);
    }

    ushort get0() const
    {
        return (ushort)wasm_i16x8_extract_lane(val, 0); // wasm_u16x8_extract_lane() is not implemented yet
    }

    v128_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(v128_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = wasm_v128_load(v);
    }

    short get0() const
    {
        return wasm_i16x8_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(v128_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    unsigned get0() const
    {
        return (unsigned)wasm_i32x4_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(v128_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    int get0() const
    {
        return wasm_i32x4_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(v128_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    float get0() const
    {
        return wasm_f32x4_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(v128_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    uint64 get0() const
    {
        return (uint64)wasm_i64x2_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(v128_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    int64 get0() const
    {
        return wasm_i64x2_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_float64x2
{
    typedef double lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(v128_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    double get0() const
    {
        return wasm_f64x2_extract_lane(val, 0);
    }

    v128_t val;
};

namespace
{
#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)

static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
} // namespace
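
// popCountTable above maps each byte value 0..255 to its number of set bits,
// e.g. popCountTable[0x0F] == 4 and popCountTable[0xFF] == 8; it serves as a
// scalar fallback for population-count style operations.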

static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
}

static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23);
}

static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23);
}

static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
}

static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31);
}

static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
}

static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
}

static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
}
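
// The wasm_unpack{lo,hi}_* helpers above emulate the SSE unpack/interleave
// instructions with a generic byte shuffle: shuffle indices 0..15 select
// bytes from a and 16..31 select bytes from b, so e.g.
// wasm_unpacklo_i8x16(a, b) produces a0,b0,a1,b1,...,a7,b7.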

// 8 >> 16
inline v128_t v128_cvtu8x16_i16x8(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i8x16(a, z);
}
inline v128_t v128_cvti8x16_i16x8(const v128_t& a)
{ return wasm_i16x8_shr(wasm_unpacklo_i8x16(a, a), 8); }
// 8 >> 32
inline v128_t v128_cvtu8x16_i32x4(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i16x8(wasm_unpacklo_i8x16(a, z), z);
}
inline v128_t v128_cvti8x16_i32x4(const v128_t& a)
{
    v128_t r = wasm_unpacklo_i8x16(a, a);
    r = wasm_unpacklo_i8x16(r, r);
    return wasm_i32x4_shr(r, 24);
}
// 16 >> 32
inline v128_t v128_cvtu16x8_i32x4(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i16x8(a, z);
}
inline v128_t v128_cvti16x8_i32x4(const v128_t& a)
{ return wasm_i32x4_shr(wasm_unpacklo_i16x8(a, a), 16); }
// 32 >> 64
inline v128_t v128_cvtu32x4_i64x2(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i32x4(a, z);
}
inline v128_t v128_cvti32x4_i64x2(const v128_t& a)
{ return wasm_unpacklo_i32x4(a, wasm_i32x4_shr(a, 31)); }

// 16 << 8
inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i8x16(a, z);
}
inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a)
{ return wasm_i16x8_shr(wasm_unpackhi_i8x16(a, a), 8); }
// 32 << 16
inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i16x8(a, z);
}
inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a)
{ return wasm_i32x4_shr(wasm_unpackhi_i16x8(a, a), 16); }
// 64 << 32
inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i32x4(a, z);
}
inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a)
{ return wasm_unpackhi_i32x4(a, wasm_i32x4_shr(a, 31)); }

#define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(a.val); }

OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int)
OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64)
OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double)
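
// OPENCV_HAL_IMPL_WASM_INITVEC generates v_setzero_<suffix>(),
// v_setall_<suffix>(v) and v_reinterpret_as_<suffix>() for each vector type.
// Illustrative use of the generated names (values hypothetical):
//     v_int32x4 zeros = v_setzero_s32();
//     v_int32x4 fives = v_setall_s32(5);
//     v_uint32x4 bits  = v_reinterpret_as_u32(fives); // same bits, new type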

inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
    return v_int8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
    return v_uint16x8(wasm_v8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
    return v_int16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
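
// Saturation in the v_pack/v_pack_u family is done by hand: wasm_v128_bitselect
// substitutes the clamp constant wherever the comparison mask is all ones, and
// a byte shuffle then gathers the low half of every widened lane into the
// narrow result.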

template<int n>
inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_u16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
template<int n>
inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
    return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
template<int n>
inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_u32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u32x4_gt(b1, maxval));
    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
template<int n>
inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
    return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
template<int n>
inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t b1 = wasm_u64x2_shr(wasm_i64x2_add(b.val, delta), n);
    return v_uint32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
template<int n>
inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t b1 = wasm_i64x2_shr(wasm_i64x2_add(b.val, delta), n);
    return v_int32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
template<int n>
inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
    return v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
template<int n>
inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
    return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
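
// Every v_rshr_pack* variant performs a rounding right shift before packing:
// adding delta = 1 << (n-1) and then shifting by n computes
// (x + (1 << (n-1))) >> n, i.e. division by 2^n rounded to nearest.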

inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(schar* ptr, const v_int16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    schar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    short t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    unsigned t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    int t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
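
// The v_pack*_store functions go through a temporary array because WASM
// SIMD128 offers no narrowing or partial store here: the full 128-bit
// register is written to t_ptr with wasm_v128_store and only the valid low
// elements are copied to the destination.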

template<int n>
inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    v128_t delta = wasm_i16x8_splat((short)(1 << (n-1)));
    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    schar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    short t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    unsigned t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    int t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    v128_t maxval = wasm_i32x4_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
    v128_t c1 = wasm_v128_bitselect(maxval, c.val, wasm_u32x4_gt(c.val, maxval));
    v128_t d1 = wasm_v128_bitselect(maxval, d.val, wasm_u32x4_gt(d.val, maxval));
    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
    return v_uint8x16(wasm_v8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    v128_t maxval = wasm_i32x4_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval));
    v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval));
    v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval));
    v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval));
    v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval));
    v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval));
    v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval));
    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
    v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
    return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
}

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
    v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
    v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
    v128_t v3 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3));
    v0 = wasm_f32x4_mul(v0, m0.val);
    v1 = wasm_f32x4_mul(v1, m1.val);
    v2 = wasm_f32x4_mul(v2, m2.val);
    v3 = wasm_f32x4_mul(v3, m3.val);

    return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, v3)));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
    v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
    v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
    v0 = wasm_f32x4_mul(v0, m0.val);
    v1 = wasm_f32x4_mul(v1, m1.val);
    v2 = wasm_f32x4_mul(v2, m2.val);

    return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, a.val)));
}
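
// v_matmul multiplies a row vector by a 4x4 matrix given as rows m0..m3:
// dst = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3. v_matmuladd is the affine
// variant, replacing the v[3]*m3 term with the constant vector a.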

#define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div)
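
// Note the asymmetry above: 8- and 16-bit +/- map to the saturating WASM
// intrinsics (matching OpenCV's universal-intrinsic semantics for those
// types), while the 32/64-bit integer and floating-point operators use the
// plain wrapping add/sub/mul/div instructions.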

// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint16x8, v_uint32x4)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int16x8, v_int32x4)
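
// There is no single-instruction saturating multiply in WASM SIMD128, so the
// macro widens, multiplies, and narrows again. Illustrative expansion for
// v_int16x8 (c and d hold the low/high halves of the 32-bit products):
//     v_int32x4 c, d;
//     v_mul_expand(a, b, c, d); // full-precision products
//     return v_pack(c, d);      // saturating narrow back to 16 bits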

// Multiply and expand
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    v_uint16x8 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    v_int16x8 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    v_int32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c.val = wasm_i32x4_mul(a0.val, b0.val);
    d.val = wasm_i32x4_mul(a1.val, b1.val);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c.val = wasm_i32x4_mul(a0.val, b0.val);
    d.val = wasm_i32x4_mul(a1.val, b1.val);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    v_uint64x2 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c.val = ((__u64x2)(a0.val) * (__u64x2)(b0.val));
    d.val = ((__u64x2)(a1.val) * (__u64x2)(b1.val));
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    v_int32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    v128_t c = wasm_i32x4_mul(a0.val, b0.val);
    v128_t d = wasm_i32x4_mul(a1.val, b1.val);
    return v_int16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    v_uint32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    v128_t c = wasm_i32x4_mul(a0.val, b0.val);
    v128_t d = wasm_i32x4_mul(a1.val, b1.val);
    return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_i32x4_shr(a.val, 16);
    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_i32x4_shr(b.val, 16);
    v128_t c = wasm_i32x4_mul(a0, b0);
    v128_t d = wasm_i32x4_mul(a1, b1);
    return v_int32x4(wasm_i32x4_add(c, d));
}
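
// v_dotprod splits each operand into even lanes (shift left then arithmetic
// shift right sign-extends the low 16 bits of every 32-bit lane) and odd
// lanes (arithmetic shift right by 16), multiplies both streams at 32-bit
// width, and sums the pairwise products.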

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
    v128_t a1 = wasm_i64x2_shr(a.val, 32);
    v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
    v128_t b1 = wasm_i64x2_shr(b.val, 32);
    v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
    v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);
    return v_int64x2(wasm_i64x2_add(c, d));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    return v_dotprod(a, b) + c;
}

// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
    v128_t a1 = wasm_u16x8_shr(a.val, 8);
    v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
    v128_t b1 = wasm_u16x8_shr(b.val, 8);
    return v_uint32x4((
        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
        v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
    );
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
    v128_t a1 = wasm_i16x8_shr(a.val, 8);
    v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
    v128_t b1 = wasm_i16x8_shr(b.val, 8);
    return v_int32x4(
        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
        v_dotprod(v_int16x8(a1), v_int16x8(b1))
    );
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b) + c; }

// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t a0 = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_u32x4_shr(a.val, 16);
    v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_u32x4_shr(b.val, 16);
    return v_uint64x2((
        v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
        v_dotprod(v_int32x4(a1), v_int32x4(b1))).val
    );
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_i32x4_shr(a.val, 16);
    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_i32x4_shr(b.val, 16);
    return v_int64x2(
        v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
        v_dotprod(v_int32x4(a1), v_int32x4(b1))
    );
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b, c); }

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b, c); }

// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b, c); }

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{ return v_dotprod_expand(a, b); }
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod_expand(a, b); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b, c); }

// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b, c); }

#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \
OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(wasm_v128_not(a.val)); \
}

OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint8x16)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint16x8)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int16x8)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint32x4)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int32x4)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint64x2)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int64x2)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float32x4)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float64x2)

inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(wasm_f32x4_sqrt(x.val));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const v128_t _1_0 = wasm_f32x4_splat(1.0);
    return v_float32x4(wasm_f32x4_div(_1_0, wasm_f32x4_sqrt(x.val)));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(wasm_f64x2_sqrt(x.val));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    const v128_t _1_0 = wasm_f64x2_splat(1.0);
    return v_float64x2(wasm_f64x2_div(_1_0, wasm_f64x2_sqrt(x.val)));
}

#define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ \
    v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); \
    v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); \
    return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); \
}

OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint8x16, v_int8x16, u8x16, i8x16, 7)
OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint16x8, v_int16x8, u16x8, i16x8, 15)
OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint32x4, v_int32x4, u32x4, i32x4, 31)
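
// Branchless integer abs: f = x >> (w-1) (arithmetic shift, all ones for
// negative lanes), s = x >>> (w-1) (logical shift, 1 for negative lanes),
// and (x ^ f) + s is the usual two's-complement negate-if-negative trick.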

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(wasm_f32x4_abs(x.val)); }
inline v_float64x2 v_abs(const v_float64x2& x)
{
    return v_float64x2(wasm_f64x2_abs(x.val));
}

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max)

#define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) \
inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); \
} \
inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); \
}

OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16)
OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8)
OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4)

#define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) \
inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t delta = wasm_##suffix##_splat(deltaNum); \
    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
    return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); \
} \
inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
{ \
    v128_t delta = wasm_##suffix##_splat(deltaNum); \
    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
    return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); \
}

OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80)
OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000)
OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000)
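
// Unsigned min/max use the classic sign-bit flip: XOR-ing both operands with
// deltaNum (0x80... for the lane width) maps the unsigned ordering onto the
// signed one, so the signed greater-than compare can drive the bitselect.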

#define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); }

OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4)
OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2)

#define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }

OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)

inline v_float32x4 v_not_nan(const v_float32x4& a)
{
    v128_t z = wasm_i32x4_splat(0x7fffffff);
    v128_t t = wasm_i32x4_splat(0x7f800000);
    return v_float32x4(wasm_u32x4_lt(wasm_v128_and(a.val, z), t));
}
inline v_float64x2 v_not_nan(const v_float64x2& a)
{
    v128_t z = wasm_i64x2_splat(0x7fffffffffffffff);
    v128_t t = wasm_i64x2_splat(0x7ff0000000000000);
    return v_float64x2((__u64x2)(wasm_v128_and(a.val, z)) < (__u64x2)t);
}
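
// v_not_nan clears the sign bit (mask z) and tests the result unsigned-less-
// than the all-ones-exponent pattern t, so a lane becomes all ones exactly
// when its biased exponent is not all ones, i.e. for ordinary finite values.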

OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012)
// details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 )
// 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2
inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
{
    uchar a_[16], b_[16];
    wasm_v128_store(a_, a.val);
    wasm_v128_store(b_, b.val);
    for (int i = 0; i < 16; i++)
        a_[i] = (uchar)(a_[i] * b_[i]);
    return v_uint8x16(wasm_v128_load(a_));
}
inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
{
    schar a_[16], b_[16];
    wasm_v128_store(a_, a.val);
    wasm_v128_store(b_, b.val);
    for (int i = 0; i < 16; i++)
        a_[i] = (schar)(a_[i] * b_[i]);
    return v_int8x16(wasm_v128_load(a_));
}
#else
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
#endif
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul)

inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return v_add_wrap(a - b, b - a); }
inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return v_max(a, b) - v_min(a, b); }

inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{
    v_int8x16 d = v_sub_wrap(a, b);
    v_int8x16 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}
inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{
    return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
}
inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    v_int32x4 d = a - b;
    v_int32x4 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}
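
// Signed absolute difference via the mask trick: with m = (a < b), which is
// all ones or all zeros per lane, and d = a - b, the expression (d ^ m) - m
// conditionally negates d, yielding |a - b| without branches.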
1324
1326inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1327{
1328 v_int8x16 d = a - b;
1329 v_int8x16 m = a < b;
1330 return (d ^ m) - m;
1331 }
1332inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1333{ return v_max(a, b) - v_min(a, b); }
1334
1335
1336inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1337{
1338 return a * b + c;
1339}
1340
1341inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1342{
1343 return v_fma(a, b, c);
1344}
1345
1346inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1347{
1348 return a * b + c;
1349}
1350
1351inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1352{
1353 return a * b + c;
1354}
1355
1356inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
1357{
1358 v128_t absmask_vec = wasm_i32x4_splat(0x7fffffff);
1359 return v_float32x4(wasm_v128_and(wasm_f32x4_sub(a.val, b.val), absmask_vec));
1360}
1361inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
1362{
1363 v128_t absmask_vec = wasm_u64x2_shr(wasm_i32x4_splat(-1), 1);
1364 return v_float64x2(wasm_v128_and(wasm_f64x2_sub(a.val, b.val), absmask_vec));
1365}
1366
1367#define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) \
1368inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1369{ \
1370 v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
1371 v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
1372 return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); \
1373} \
1374inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1375{ \
1376 v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
1377 v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
1378 return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); \
1379} \
1380inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1381{ \
1382 return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); \
1383}
1384
1385OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
1386OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
1387
1388#define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
1389inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
1390{ \
1391 return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
1392} \
1393inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
1394{ \
1395 return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
1396} \
1397inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
1398{ \
1399 return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
1400} \
1401inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
1402{ \
1403 return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
1404} \
1405template<int imm> \
1406inline _Tpuvec v_shl(const _Tpuvec& a) \
1407{ \
1408 return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
1409} \
1410template<int imm> \
1411inline _Tpsvec v_shl(const _Tpsvec& a) \
1412{ \
1413 return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
1414} \
1415template<int imm> \
1416inline _Tpuvec v_shr(const _Tpuvec& a) \
1417{ \
1418 return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
1419} \
1420template<int imm> \
1421inline _Tpsvec v_shr(const _Tpsvec& a) \
1422{ \
1423 return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
1424}
1425
1426OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint8x16, v_int8x16, i8x16, u8x16)
1427OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint16x8, v_int16x8, i16x8, u16x8)
1428OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint32x4, v_int32x4, i32x4, u32x4)
1429OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint64x2, v_int64x2, i64x2, u64x2)
1430
1431namespace hal_wasm_internal
1432{
1433 template <int imm,
1434 bool is_invalid = ((imm < 0) || (imm > 16)),
1435 bool is_first = (imm == 0),
1436 bool is_second = (imm == 16),
1437 bool is_other = (((imm > 0) && (imm < 16)))>
1438 class v_wasm_palignr_u8_class;
1439
1440 template <int imm>
1441 class v_wasm_palignr_u8_class<imm, true, false, false, false>;
1442
1443 template <int imm>
1444 class v_wasm_palignr_u8_class<imm, false, true, false, false>
1445 {
1446 public:
1447 inline v128_t operator()(const v128_t& a, const v128_t&) const
1448 {
1449 return a;
1450 }
1451 };
1452
1453 template <int imm>
1454 class v_wasm_palignr_u8_class<imm, false, false, true, false>
1455 {
1456 public:
1457 inline v128_t operator()(const v128_t&, const v128_t& b) const
1458 {
1459 return b;
1460 }
1461 };
1462
1463 template <int imm>
1464 class v_wasm_palignr_u8_class<imm, false, false, false, true>
1465 {
1466 public:
1467 inline v128_t operator()(const v128_t& a, const v128_t& b) const
1468 {
1469 enum { imm2 = (sizeof(v128_t) - imm) };
1470 return wasm_v8x16_shuffle(a, b,
1471 imm, imm+1, imm+2, imm+3,
1472 imm+4, imm+5, imm+6, imm+7,
1473 imm+8, imm+9, imm+10, imm+11,
1474 imm+12, imm+13, imm+14, imm+15);
1475 }
1476 };
1477
1478 template <int imm>
1479 inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
1480 {
1481 CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
1482 return v_wasm_palignr_u8_class<imm>()(a, b);
1483 }
1484}
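// How the helper above works (sketch): v_wasm_palignr_u8<imm> views the two
// registers as a 32-byte sequence [a | b] and returns bytes imm..imm+15,
// roughly mirroring SSSE3 palignr. In wasm_v8x16_shuffle, lane indices 0..15
// pick from 'a' and 16..31 pick from 'b', so imm == 4 yields a[4..15]
// followed by b[0..3].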
1485
1486template<int imm, typename _Tpvec>
1487inline _Tpvec v_rotate_right(const _Tpvec &a)
1488{
1489 using namespace hal_wasm_internal;
1490 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1491 v128_t z = wasm_i8x16_splat(0);
1492 return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, z));
1493}
1494
1495template<int imm, typename _Tpvec>
1496inline _Tpvec v_rotate_left(const _Tpvec &a)
1497{
1498 using namespace hal_wasm_internal;
1499 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1500 v128_t z = wasm_i8x16_splat(0);
1501 return _Tpvec(v_wasm_palignr_u8<imm2>(z, a.val));
1502}
1503
1504template<int imm, typename _Tpvec>
1505inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1506{
1507 using namespace hal_wasm_internal;
1508 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1509 return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, b.val));
1510}
1511
1512template<int imm, typename _Tpvec>
1513inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1514{
1515 using namespace hal_wasm_internal;
1516 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1517 return _Tpvec(v_wasm_palignr_u8<imm2>(b.val, a.val));
1518}
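// Illustrative sketch: the rotates above are whole-register byte shifts built
// on the palignr helper; in the single-operand forms the vacated lanes are
// filled with zeros:
//   v_uint32x4 v(1, 2, 3, 4);
//   v_rotate_right<1>(v);  // {2, 3, 4, 0}
//   v_rotate_left<1>(v);   // {0, 1, 2, 3}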
1519
1520#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1521inline _Tpvec v_load(const _Tp* ptr) \
1522{ return _Tpvec(wasm_v128_load(ptr)); } \
1523inline _Tpvec v_load_aligned(const _Tp* ptr) \
1524{ return _Tpvec(wasm_v128_load(ptr)); } \
1525inline _Tpvec v_load_low(const _Tp* ptr) \
1526{ \
1527 _Tp tmp[_Tpvec::nlanes] = {0}; \
1528 for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1529 tmp[i] = ptr[i]; \
1530 } \
1531 return _Tpvec(wasm_v128_load(tmp)); \
1532} \
1533inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1534{ \
1535 _Tp tmp[_Tpvec::nlanes]; \
1536 for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1537 tmp[i] = ptr0[i]; \
1538 tmp[i+_Tpvec::nlanes/2] = ptr1[i]; \
1539 } \
1540 return _Tpvec(wasm_v128_load(tmp)); \
1541} \
1542inline void v_store(_Tp* ptr, const _Tpvec& a) \
1543{ wasm_v128_store(ptr, a.val); } \
1544inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1545{ wasm_v128_store(ptr, a.val); } \
1546inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1547{ wasm_v128_store(ptr, a.val); } \
1548inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1549{ \
1550 wasm_v128_store(ptr, a.val); \
1551} \
1552inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1553{ \
1554 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1555 wasm_v128_store(a_, a.val); \
1556 for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1557 ptr[i] = a_[i]; \
1558} \
1559inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1560{ \
1561 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1562 wasm_v128_store(a_, a.val); \
1563 for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1564 ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
1565}
1566
1567OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
1568OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
1569OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
1570OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
1571OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1572OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int)
1573OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64)
1574OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64)
1575OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float)
1576OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double)
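// Note (sketch): wasm v128 loads and stores tolerate any alignment (alignment
// is only a hint in the spec), so the aligned, unaligned and nocache variants
// above all lower to the same wasm_v128_load / wasm_v128_store. A typical
// round trip:
//   float buf[4] = {1.f, 2.f, 3.f, 4.f};
//   v_float32x4 v = v_load(buf);
//   v_store(buf, v);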
1577
1578
1579
1580inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1581{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
1582
1583inline v_int8x16 v_reverse(const v_int8x16 &a)
1584{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1585
1586inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1587{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); }
1588
1589inline v_int16x8 v_reverse(const v_int16x8 &a)
1590{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1591
1592inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1593{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); }
1594
1595inline v_int32x4 v_reverse(const v_int32x4 &a)
1596{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1597
1598inline v_float32x4 v_reverse(const v_float32x4 &a)
1599{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1600
1601inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1602{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); }
1603
1604inline v_int64x2 v_reverse(const v_int64x2 &a)
1605{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1606
1607inline v_float64x2 v_reverse(const v_float64x2 &a)
1608{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1609
1610
1611#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1612inline scalartype v_reduce_sum(const _Tpvec& a) \
1613{ \
1614 regtype val = a.val; \
1615 val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1616 val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \
1617 return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1618}
1619
1620OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4)
1621OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4)
1622OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
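// Worked example for the 4-lane reduction above (illustrative): starting from
// lanes {1, 2, 3, 4}, the first shuffle swaps the 64-bit halves and adds,
// giving {4, 6, 4, 6}; the second rotates by one 32-bit lane and adds, giving
// {10, 10, 10, 10}, so lane 0 carries the total.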
1623
1624// TODO: optimize v_reduce_sum with wasm intrinsics.
1625// For now, a fallback implementation is used, as wasm intrinsics lack a widening add.
1626
1627#define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \
1628inline scalartype v_reduce_sum(const _Tpvec& a) \
1629{ \
1630 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1631 wasm_v128_store(a_, a.val); \
1632 scalartype c = a_[0]; \
1633 for (int i = 1; i < _Tpvec::nlanes; i++) \
1634 c += a_[i]; \
1635 return c; \
1636}
1637
1638OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
1639OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int)
1640OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned)
1641OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int)
1642
1643
1644#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1645inline scalartype v_reduce_sum(const _Tpvec& a) \
1646{ \
1647 regtype val = a.val; \
1648 val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1649 return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1650}
1651OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2)
1652OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64, v128_t, i64x2, i64x2)
1653OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double, v128_t, f64x2, f64x2)
1654
1655inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1656 const v_float32x4& c, const v_float32x4& d)
1657{
1658 v128_t ac = wasm_f32x4_add(wasm_unpacklo_i32x4(a.val, c.val), wasm_unpackhi_i32x4(a.val, c.val));
1659 v128_t bd = wasm_f32x4_add(wasm_unpacklo_i32x4(b.val, d.val), wasm_unpackhi_i32x4(b.val, d.val));
1660 return v_float32x4(wasm_f32x4_add(wasm_unpacklo_i32x4(ac, bd), wasm_unpackhi_i32x4(ac, bd)));
1661}
1662
1663#define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) \
1664inline scalartype v_reduce_##func(const _Tpvec& a) \
1665{ \
1666 scalartype buf[_Tpvec::nlanes]; \
1667 v_store(buf, a); \
1668 scalartype tmp = buf[0]; \
1669 for (int i=1; i<_Tpvec::nlanes; ++i) { \
1670 tmp = scalar_func(tmp, buf[i]); \
1671 } \
1672 return tmp; \
1673}
1674
1675OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, max, std::max)
1676OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, min, std::min)
1677OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, max, std::max)
1678OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, min, std::min)
1679OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, max, std::max)
1680OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, min, std::min)
1681OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, max, std::max)
1682OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, min, std::min)
1683OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, max, std::max)
1684OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, min, std::min)
1685OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, max, std::max)
1686OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, min, std::min)
1687OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, max, std::max)
1688OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, min, std::min)
1689
1690inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1691{
1692 v_uint16x8 l16, h16;
1693 v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1694 v_expand(v_absdiff(a, b), l16, h16);
1695 v_expand(l16, l16_l32, l16_h32);
1696 v_expand(h16, h16_l32, h16_h32);
1697 return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1698}
1699inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1700{
1701 v_uint16x8 l16, h16;
1702 v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1703 v_expand(v_absdiff(a, b), l16, h16);
1704 v_expand(l16, l16_l32, l16_h32);
1705 v_expand(h16, h16_l32, h16_h32);
1706 return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1707}
1708inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1709{
1710 v_uint32x4 l, h;
1711 v_expand(v_absdiff(a, b), l, h);
1712 return v_reduce_sum(l + h);
1713}
1714inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1715{
1716 v_uint32x4 l, h;
1717 v_expand(v_absdiff(a, b), l, h);
1718 return v_reduce_sum(l + h);
1719}
1720inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1721{
1722 return v_reduce_sum(v_absdiff(a, b));
1723}
1724inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1725{
1726 return v_reduce_sum(v_absdiff(a, b));
1727}
1728inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1729{
1730 return v_reduce_sum(v_absdiff(a, b));
1731}
1732
1733inline v_uint8x16 v_popcount(const v_uint8x16& a)
1734{
1735 v128_t m1 = wasm_i32x4_splat(0x55555555);
1736 v128_t m2 = wasm_i32x4_splat(0x33333333);
1737 v128_t m4 = wasm_i32x4_splat(0x0f0f0f0f);
1738 v128_t p = a.val;
1739 p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 1), m1), wasm_v128_and(p, m1));
1740 p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 2), m2), wasm_v128_and(p, m2));
1741 p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 4), m4), wasm_v128_and(p, m4));
1742 return v_uint8x16(p);
1743}
1744inline v_uint16x8 v_popcount(const v_uint16x8& a)
1745{
1746 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1747 p += v_rotate_right<1>(p);
1748 return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
1749}
1750inline v_uint32x4 v_popcount(const v_uint32x4& a)
1751{
1752 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1753 p += v_rotate_right<1>(p);
1754 p += v_rotate_right<2>(p);
1755 return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
1756}
1757inline v_uint64x2 v_popcount(const v_uint64x2& a)
1758{
1759 uint64 a_[2], b_[2] = { 0 };
1760 wasm_v128_store(a_, a.val);
1761 for (int i = 0; i < 16; i++)
1762 b_[i / 8] += popCountTable[((uint8_t*)a_)[i]];
1763 return v_uint64x2(wasm_v128_load(b_));
1764}
1765inline v_uint8x16 v_popcount(const v_int8x16& a)
1766{ return v_popcount(v_reinterpret_as_u8(a)); }
1767inline v_uint16x8 v_popcount(const v_int16x8& a)
1768{ return v_popcount(v_reinterpret_as_u16(a)); }
1769inline v_uint32x4 v_popcount(const v_int32x4& a)
1770{ return v_popcount(v_reinterpret_as_u32(a)); }
1771inline v_uint64x2 v_popcount(const v_int64x2& a)
1772{ return v_popcount(v_reinterpret_as_u64(a)); }
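// The u8 popcount above is the classic SWAR bit count (sketch): each step
// sums adjacent 1-, 2- and 4-bit fields through the masks 0x55.., 0x33.. and
// 0x0f.., leaving a per-byte count of 0..8, which always fits in a byte. The
// wider types then fold neighbouring byte counts with byte rotations:
//   v_popcount(v_setall_u8(0xff));  // every byte lane holds 8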
1773
1774#define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \
1775inline int v_signmask(const _Tpvec& a) \
1776{ \
1777 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1778 wasm_v128_store(a_, a.val); \
1779 int mask = 0; \
1780 for (int i = 0; i < _Tpvec::nlanes; i++) \
1781 mask |= (reinterpret_int(a_[i]) < 0) << i; \
1782 return mask; \
1783} \
1784inline bool v_check_all(const _Tpvec& a) \
1785{ return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \
1786inline bool v_check_any(const _Tpvec& a) \
1787{ return wasm_i8x16_any_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); }
1788
1789OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar)
1790OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar)
1791OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short)
1792OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short)
1793OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int)
1794OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int)
1795OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float)
1796OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float64x2, f64x2, double)
1797
1798#define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) \
1799inline bool v_check_all(const _Tpvec& a) \
1800{ \
1801 v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1802 masked = wasm_i32x4_replace_lane(masked, 0, 0xffffffff); \
1803 masked = wasm_i32x4_replace_lane(masked, 2, 0xffffffff); \
1804 return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1805} \
1806inline bool v_check_any(const _Tpvec& a) \
1807{ \
1808 v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1809 masked = wasm_i32x4_replace_lane(masked, 0, 0x0); \
1810 masked = wasm_i32x4_replace_lane(masked, 2, 0x0); \
1811 return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1812} \
1813
1814OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32)
1815OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32)
1816
1817
1818inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1819inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1820inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1821inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1822inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1823inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1824inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1825inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1826inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1827inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1828
1829#define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) \
1830inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1831{ \
1832 return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); \
1833}
1834
1835OPENCV_HAL_IMPL_WASM_SELECT(v_uint8x16)
1836OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16)
1837OPENCV_HAL_IMPL_WASM_SELECT(v_uint16x8)
1838OPENCV_HAL_IMPL_WASM_SELECT(v_int16x8)
1839OPENCV_HAL_IMPL_WASM_SELECT(v_uint32x4)
1840OPENCV_HAL_IMPL_WASM_SELECT(v_int32x4)
1841OPENCV_HAL_IMPL_WASM_SELECT(v_uint64x2)
1842OPENCV_HAL_IMPL_WASM_SELECT(v_int64x2)
1843OPENCV_HAL_IMPL_WASM_SELECT(v_float32x4)
1844OPENCV_HAL_IMPL_WASM_SELECT(v_float64x2)
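// Semantics sketch for v_select above: wasm_v128_bitselect(a, b, mask) takes
// bits from 'a' where the mask bit is 1 and from 'b' where it is 0. With the
// all-ones / all-zeros lane masks produced by the comparison operators, this
// acts as a per-lane "mask ? a : b":
//   v_int32x4 a = v_setall_s32(7), b = v_setall_s32(9);
//   v_select(a == a, a, b);  // all lanes 7 (mask is all-ones)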
1845
1846#define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1847inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1848{ \
1849 b0.val = intrin(a.val); \
1850 b1.val = __CV_CAT(intrin, _high)(a.val); \
1851} \
1852inline _Tpwvec v_expand_low(const _Tpvec& a) \
1853{ return _Tpwvec(intrin(a.val)); } \
1854inline _Tpwvec v_expand_high(const _Tpvec& a) \
1855{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
1856inline _Tpwvec v_load_expand(const _Tp* ptr) \
1857{ \
1858 v128_t a = wasm_v128_load(ptr); \
1859 return _Tpwvec(intrin(a)); \
1860}
1861
1862OPENCV_HAL_IMPL_WASM_EXPAND(v_uint8x16, v_uint16x8, uchar, v128_cvtu8x16_i16x8)
1863OPENCV_HAL_IMPL_WASM_EXPAND(v_int8x16, v_int16x8, schar, v128_cvti8x16_i16x8)
1864OPENCV_HAL_IMPL_WASM_EXPAND(v_uint16x8, v_uint32x4, ushort, v128_cvtu16x8_i32x4)
1865OPENCV_HAL_IMPL_WASM_EXPAND(v_int16x8, v_int32x4, short, v128_cvti16x8_i32x4)
1866OPENCV_HAL_IMPL_WASM_EXPAND(v_uint32x4, v_uint64x2, unsigned, v128_cvtu32x4_i64x2)
1867OPENCV_HAL_IMPL_WASM_EXPAND(v_int32x4, v_int64x2, int, v128_cvti32x4_i64x2)
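// Usage sketch for the expand family above: each lane widens to the next
// integer size, the low half of the register into b0 and the high half
// into b1:
//   v_uint8x16 v = v_setall_u8(200);
//   v_uint16x8 lo, hi;
//   v_expand(v, lo, hi);  // every u16 lane holds 200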
1868
1869#define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin) \
1870inline _Tpvec v_load_expand_q(const _Tp* ptr) \
1871{ \
1872 v128_t a = wasm_v128_load(ptr); \
1873 return _Tpvec(intrin(a)); \
1874}
1875
1876OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4)
1877OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, v128_cvti8x16_i32x4)
1878
1879#define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) \
1880inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1881{ \
1882 b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
1883 b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
1884} \
1885inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1886{ \
1887 return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
1888} \
1889inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1890{ \
1891 return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
1892} \
1893inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1894{ \
1895 c.val = wasm_unpacklo_i64x2(a.val, b.val); \
1896 d.val = wasm_unpackhi_i64x2(a.val, b.val); \
1897}
1898
1899OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
1900OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
1901OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
1902OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
1903OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
1904OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
1905OPENCV_HAL_IMPL_WASM_UNPACKS(v_float32x4, i32x4)
1906OPENCV_HAL_IMPL_WASM_UNPACKS(v_float64x2, i64x2)
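// Sketch: v_zip interleaves two registers lane by lane, while
// v_combine_low/high splice 64-bit halves; for u32 lanes a = {a0,a1,a2,a3}
// and b = {b0,b1,b2,b3}:
//   v_zip(a, b, r0, r1);  // r0 = {a0,b0,a1,b1}, r1 = {a2,b2,a3,b3}
//   v_combine_low(a, b);  // {a0,a1,b0,b1}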
1907
1908template<int s, typename _Tpvec>
1909inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
1910{
1911 return v_rotate_right<s>(a, b);
1912}
1913
1914inline v_int32x4 v_round(const v_float32x4& a)
1915{
1916 v128_t h = wasm_f32x4_splat(0.5);
1917 return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
1918}
1919
1920inline v_int32x4 v_floor(const v_float32x4& a)
1921{
1922 v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1923 v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1));
1924 return v_int32x4(wasm_i32x4_add(a1, mask));
1925}
1926
1927inline v_int32x4 v_ceil(const v_float32x4& a)
1928{
1929 v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1930 v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1));
1931 return v_int32x4(wasm_i32x4_sub(a1, mask));
1932}
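// Why adding/subtracting 'mask' works (sketch): wasm comparisons return
// all-ones (integer -1) in true lanes, so v_floor subtracts 1 exactly in
// lanes where truncation landed above the true value (a < trunc(a)), and
// v_ceil adds 1 where it landed below. Example: a = -1.5 truncates to -1;
// since -1.5 < -1.0 the mask is -1 and -1 + (-1) = -2 == floor(-1.5).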
1933
1934inline v_int32x4 v_trunc(const v_float32x4& a)
1935{ return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
1936
1937#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
1938inline v_int32x4 func(const v_float64x2& a) \
1939{ \
1940 double a_[2]; \
1941 wasm_v128_store(a_, a.val); \
1942 int c_[4]; \
1943 c_[0] = cfunc(a_[0]); \
1944 c_[1] = cfunc(a_[1]); \
1945 c_[2] = 0; \
1946 c_[3] = 0; \
1947 return v_int32x4(wasm_v128_load(c_)); \
1948}
1949
1950OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
1951OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
1952OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
1953OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
1954
1955inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1956{
1957 double a_[2], b_[2];
1958 wasm_v128_store(a_, a.val);
1959 wasm_v128_store(b_, b.val);
1960 int c_[4];
1961 c_[0] = cvRound(a_[0]);
1962 c_[1] = cvRound(a_[1]);
1963 c_[2] = cvRound(b_[0]);
1964 c_[3] = cvRound(b_[1]);
1965 return v_int32x4(wasm_v128_load(c_));
1966}
1967
1968#define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
1969inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1970 const _Tpvec& a2, const _Tpvec& a3, \
1971 _Tpvec& b0, _Tpvec& b1, \
1972 _Tpvec& b2, _Tpvec& b3) \
1973{ \
1974 v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
1975 v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
1976 v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
1977 v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
1978\
1979 b0.val = wasm_unpacklo_i64x2(t0, t1); \
1980 b1.val = wasm_unpackhi_i64x2(t0, t1); \
1981 b2.val = wasm_unpacklo_i64x2(t2, t3); \
1982 b3.val = wasm_unpackhi_i64x2(t2, t3); \
1983}
1984
1985OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
1986OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
1987OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4, i32x4)
1988
1989// load deinterleave
1990inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
1991{
1992 v128_t t00 = wasm_v128_load(ptr);
1993 v128_t t01 = wasm_v128_load(ptr + 16);
1994
1995 a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30);
1996 b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31);
1997}
1998
1999inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
2000{
2001 v128_t t00 = wasm_v128_load(ptr);
2002 v128_t t01 = wasm_v128_load(ptr + 16);
2003 v128_t t02 = wasm_v128_load(ptr + 32);
2004
2005 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7);
2006 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6);
2007 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7);
2008
2009 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29);
2010 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30);
2011 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31);
2012}
2013
2014inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2015{
2016 v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2017 v128_t u1 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2018 v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ...
2019 v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ...
2020
2021 v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2022 v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2023 v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2024 v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2025
2026 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2027 b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2028 c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2029 d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2030}
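// Sketch of the 4-channel deinterleave above: the first shuffle stage gathers
// every fourth byte, packing a-lanes then b-lanes into v0/v1 and c-lanes then
// d-lanes into v2/v3; the second stage splices the matching 8-byte halves, so
// 'a' ends up with input bytes {0, 4, 8, ..., 60}.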
2031
2032inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2033{
2034 v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 a2 b2 a3 b3
2035 v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7
2036
2037 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7
2038 b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 b2 b3 b4 b5 b6 b7
2039}
2040
2041inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2042{
2043 v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 b1 c1 a2 b2
2044 v128_t t01 = wasm_v128_load(ptr + 8); // c2 a3 b3 c3 a4 b4 c4 a5
2045 v128_t t02 = wasm_v128_load(ptr + 16); // b5 c5 a6 b6 c6 a7 b7 c7
2046
2047 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5);
2048 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7);
2049 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7);
2050
2051 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27);
2052 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29);
2053 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31);
2054}
2055
2056inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2057{
2058 v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2059 v128_t u1 = wasm_v128_load(ptr + 8); // a2 b2 c2 d2 ...
2060 v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2061 v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ...
2062
2063 v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3
2064 v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7
2065 v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3
2066 v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7
2067
2068 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2069 b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2070 c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2071 d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2072}
2073
2074inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2075{
2076 v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1
2077 v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3
2078
2079 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2080 b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2081}
2082
2083inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2084{
2085 v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1
2086 v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3
2087 v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4
2088
2089 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2090 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2091 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2092
2093 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2094 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2095 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2096}
2097
2098inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2099{
2100 v_uint32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0
2101 v_uint32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1
2102 v_uint32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2
2103 v_uint32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2104
2105 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2106}
2107
2108inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2109{
2110 v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1
2111 v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3
2112
2113 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2114 b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2115}
2116
2117inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2118{
2119 v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1
2120 v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3
2121 v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4
2122
2123 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2124 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2125 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2126
2127 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2128 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2129 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2130}
2131
2132inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2133{
2134 v_float32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0
2135 v_float32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1
2136 v_float32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2
2137 v_float32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2138
2139 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2140}
2141
2142inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2143{
2144 v128_t t0 = wasm_v128_load(ptr); // a0 b0
2145 v128_t t1 = wasm_v128_load(ptr + 2); // a1 b1
2146
2147 a.val = wasm_unpacklo_i64x2(t0, t1);
2148 b.val = wasm_unpackhi_i64x2(t0, t1);
2149}
2150
2151inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2152{
2153 v128_t t0 = wasm_v128_load(ptr); // a0, b0
2154 v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1
2155 v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1
2156
2157 a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2158 b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23);
2159 c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2160}
2161
2162inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2163 v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2164{
2165 v128_t t0 = wasm_v128_load(ptr); // a0 b0
2166 v128_t t1 = wasm_v128_load(ptr + 2); // c0 d0
2167 v128_t t2 = wasm_v128_load(ptr + 4); // a1 b1
2168 v128_t t3 = wasm_v128_load(ptr + 6); // c1 d1
2169
2170 a.val = wasm_unpacklo_i64x2(t0, t2);
2171 b.val = wasm_unpackhi_i64x2(t0, t2);
2172 c.val = wasm_unpacklo_i64x2(t1, t3);
2173 d.val = wasm_unpackhi_i64x2(t1, t3);
2174}
2175
2176// store interleave
2177
2178inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2179 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2180{
2181 v128_t v0 = wasm_unpacklo_i8x16(a.val, b.val);
2182 v128_t v1 = wasm_unpackhi_i8x16(a.val, b.val);
2183
2184 wasm_v128_store(ptr, v0);
2185 wasm_v128_store(ptr + 16, v1);
2186}
2187
2188inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2189 const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2190{
2191 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5);
2192 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26);
2193 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0);
2194
2195 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15);
2196 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15);
2197 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31);
2198
2199 wasm_v128_store(ptr, t10);
2200 wasm_v128_store(ptr + 16, t11);
2201 wasm_v128_store(ptr + 32, t12);
2202}
2203
2204inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2205 const v_uint8x16& c, const v_uint8x16& d,
2206 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2207{
2208 // a0 a1 a2 a3 ....
2209 // b0 b1 b2 b3 ....
2210 // c0 c1 c2 c3 ....
2211 // d0 d1 d2 d3 ....
2212 v128_t u0 = wasm_unpacklo_i8x16(a.val, c.val); // a0 c0 a1 c1 ...
2213 v128_t u1 = wasm_unpackhi_i8x16(a.val, c.val); // a8 c8 a9 c9 ...
2214 v128_t u2 = wasm_unpacklo_i8x16(b.val, d.val); // b0 d0 b1 d1 ...
2215 v128_t u3 = wasm_unpackhi_i8x16(b.val, d.val); // b8 d8 b9 d9 ...
2216
2217 v128_t v0 = wasm_unpacklo_i8x16(u0, u2); // a0 b0 c0 d0 ...
2218 v128_t v1 = wasm_unpackhi_i8x16(u0, u2); // a4 b4 c4 d4 ...
2219 v128_t v2 = wasm_unpacklo_i8x16(u1, u3); // a8 b8 c8 d8 ...
2220 v128_t v3 = wasm_unpackhi_i8x16(u1, u3); // a12 b12 c12 d12 ...
2221
2222 wasm_v128_store(ptr, v0);
2223 wasm_v128_store(ptr + 16, v1);
2224 wasm_v128_store(ptr + 32, v2);
2225 wasm_v128_store(ptr + 48, v3);
2226}
2227
2228inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2229 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2230{
2231 v128_t v0 = wasm_unpacklo_i16x8(a.val, b.val);
2232 v128_t v1 = wasm_unpackhi_i16x8(a.val, b.val);
2233
2234 wasm_v128_store(ptr, v0);
2235 wasm_v128_store(ptr + 8, v1);
2236}
2237
2238inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2239 const v_uint16x8& b, const v_uint16x8& c,
2240 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2241{
2242 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21);
2243 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11);
2244 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0);
2245
2246 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15);
2247 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15);
2248 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31);
2249
2250 wasm_v128_store(ptr, t10);
2251 wasm_v128_store(ptr + 8, t11);
2252 wasm_v128_store(ptr + 16, t12);
2253}
2254
2255inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2256 const v_uint16x8& c, const v_uint16x8& d,
2257 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2258{
2259 // a0 a1 a2 a3 ....
2260 // b0 b1 b2 b3 ....
2261 // c0 c1 c2 c3 ....
2262 // d0 d1 d2 d3 ....
2263 v128_t u0 = wasm_unpacklo_i16x8(a.val, c.val); // a0 c0 a1 c1 ...
2264 v128_t u1 = wasm_unpackhi_i16x8(a.val, c.val); // a4 c4 a5 c5 ...
2265 v128_t u2 = wasm_unpacklo_i16x8(b.val, d.val); // b0 d0 b1 d1 ...
2266 v128_t u3 = wasm_unpackhi_i16x8(b.val, d.val); // b4 d4 b5 d5 ...
2267
2268 v128_t v0 = wasm_unpacklo_i16x8(u0, u2); // a0 b0 c0 d0 ...
2269 v128_t v1 = wasm_unpackhi_i16x8(u0, u2); // a2 b2 c2 d2 ...
2270 v128_t v2 = wasm_unpacklo_i16x8(u1, u3); // a4 b4 c4 d4 ...
2271 v128_t v3 = wasm_unpackhi_i16x8(u1, u3); // a6 b6 c6 d6 ...
2272
2273 wasm_v128_store(ptr, v0);
2274 wasm_v128_store(ptr + 8, v1);
2275 wasm_v128_store(ptr + 16, v2);
2276 wasm_v128_store(ptr + 24, v3);
2277}
2278
2279inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2280 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2281{
2282 v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2283 v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2284
2285 wasm_v128_store(ptr, v0);
2286 wasm_v128_store(ptr + 4, v1);
2287}
2288
2289inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2290 const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2291{
2292 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2293 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2294 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2295
2296 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2297 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2298 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2299
2300 wasm_v128_store(ptr, t10);
2301 wasm_v128_store(ptr + 4, t11);
2302 wasm_v128_store(ptr + 8, t12);
2303}
2304
2305inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2306 const v_uint32x4& c, const v_uint32x4& d,
2307 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2308{
2309 v_uint32x4 v0, v1, v2, v3;
2310 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2311
2312 wasm_v128_store(ptr, v0.val);
2313 wasm_v128_store(ptr + 4, v1.val);
2314 wasm_v128_store(ptr + 8, v2.val);
2315 wasm_v128_store(ptr + 12, v3.val);
2316}
2317
2318// 2-channel, float only
2319inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2320 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2321{
2322 v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2323 v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2324
2325 wasm_v128_store(ptr, v0);
2326 wasm_v128_store(ptr + 4, v1);
2327}
2328
2329inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2330 const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2331{
2332 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2333 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2334 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2335
2336 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2337 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2338 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2339
2340 wasm_v128_store(ptr, t10);
2341 wasm_v128_store(ptr + 4, t11);
2342 wasm_v128_store(ptr + 8, t12);
2343}
2344
2345inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2346 const v_float32x4& c, const v_float32x4& d,
2347 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2348{
2349 v_float32x4 v0, v1, v2, v3;
2350 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2351
2352 wasm_v128_store(ptr, v0.val);
2353 wasm_v128_store(ptr + 4, v1.val);
2354 wasm_v128_store(ptr + 8, v2.val);
2355 wasm_v128_store(ptr + 12, v3.val);
2356}
2357
2358inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2359 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2360{
2361 v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
2362 v128_t v1 = wasm_unpackhi_i64x2(a.val, b.val);
2363
2364 wasm_v128_store(ptr, v0);
2365 wasm_v128_store(ptr + 2, v1);
2366}
2367
2368inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2369 const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2370{
2371 v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2372 v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15);
2373 v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2374
2375 wasm_v128_store(ptr, v0);
2376 wasm_v128_store(ptr + 2, v1);
2377 wasm_v128_store(ptr + 4, v2);
2378}
2379
2380inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2381 const v_uint64x2& c, const v_uint64x2& d,
2382 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2383{
2384 v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
2385 v128_t v1 = wasm_unpacklo_i64x2(c.val, d.val);
2386 v128_t v2 = wasm_unpackhi_i64x2(a.val, b.val);
2387 v128_t v3 = wasm_unpackhi_i64x2(c.val, d.val);
2388
2389 wasm_v128_store(ptr, v0);
2390 wasm_v128_store(ptr + 2, v1);
2391 wasm_v128_store(ptr + 4, v2);
2392 wasm_v128_store(ptr + 6, v3);
2393}
2394
2395#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2396inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2397{ \
2398 _Tpvec1 a1, b1; \
2399 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2400 a0 = v_reinterpret_as_##suffix0(a1); \
2401 b0 = v_reinterpret_as_##suffix0(b1); \
2402} \
2403inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2404{ \
2405 _Tpvec1 a1, b1, c1; \
2406 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2407 a0 = v_reinterpret_as_##suffix0(a1); \
2408 b0 = v_reinterpret_as_##suffix0(b1); \
2409 c0 = v_reinterpret_as_##suffix0(c1); \
2410} \
2411inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2412{ \
2413 _Tpvec1 a1, b1, c1, d1; \
2414 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2415 a0 = v_reinterpret_as_##suffix0(a1); \
2416 b0 = v_reinterpret_as_##suffix0(b1); \
2417 c0 = v_reinterpret_as_##suffix0(c1); \
2418 d0 = v_reinterpret_as_##suffix0(d1); \
2419} \
2420inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2421 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2422{ \
2423 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2424 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2425 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2426} \
2427inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2428 const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2429{ \
2430 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2431 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2432 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2433 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2434} \
2435inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2436 const _Tpvec0& c0, const _Tpvec0& d0, \
2437 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2438{ \
2439 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2440 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2441 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2442 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2443 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2444}
2445
2446OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2447OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2448OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2449OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2450OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
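// Sketch: the macro above derives the signed/float (de)interleave overloads by
// reinterpreting to the equally-sized unsigned lane type, delegating, and
// reinterpreting back; the in-memory byte pattern is unchanged:
//   short buf[16] = {0};
//   v_int16x8 a, b;
//   v_load_deinterleave(buf, a, b);  // runs the v_uint16x8 version underneath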
2451
2452inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2453{
2454 return v_float32x4(wasm_f32x4_convert_i32x4(a.val));
2455}
2456
2457inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2458{
2459 double a_[2];
2460 wasm_v128_store(a_, a.val);
2461 float c_[4];
2462 c_[0] = (float)(a_[0]);
2463 c_[1] = (float)(a_[1]);
2464 c_[2] = 0;
2465 c_[3] = 0;
2466 return v_float32x4(wasm_v128_load(c_));
2467}
2468
2469inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2470{
2471 double a_[2], b_[2];
2472 wasm_v128_store(a_, a.val);
2473 wasm_v128_store(b_, b.val);
2474 float c_[4];
2475 c_[0] = (float)(a_[0]);
2476 c_[1] = (float)(a_[1]);
2477 c_[2] = (float)(b_[0]);
2478 c_[3] = (float)(b_[1]);
2479 return v_float32x4(wasm_v128_load(c_));
2480}
2481
2482inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2483{
2484#ifdef __wasm_unimplemented_simd128__
2485 v128_t p = v128_cvti32x4_i64x2(a.val);
2486 return v_float64x2(wasm_f64x2_convert_i64x2(p));
2487#else
2488 int a_[4];
2489 wasm_v128_store(a_, a.val);
2490 double c_[2];
2491 c_[0] = (double)(a_[0]);
2492 c_[1] = (double)(a_[1]);
2493 return v_float64x2(wasm_v128_load(c_));
2494#endif
2495}
2496
2497inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2498{
2499#ifdef __wasm_unimplemented_simd128__
2500 v128_t p = v128_cvti32x4_i64x2_high(a.val);
2501 return v_float64x2(wasm_f64x2_convert_i64x2(p));
2502#else
2503 int a_[4];
2504 wasm_v128_store(a_, a.val);
2505 double c_[2];
2506 c_[0] = (double)(a_[2]);
2507 c_[1] = (double)(a_[3]);
2508 return v_float64x2(wasm_v128_load(c_));
2509#endif
2510}
2511
2512inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2513{
2514 float a_[4];
2515 wasm_v128_store(a_, a.val);
2516 double c_[2];
2517 c_[0] = (double)(a_[0]);
2518 c_[1] = (double)(a_[1]);
2519 return v_float64x2(wasm_v128_load(c_));
2520}
2521
2522inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2523{
2524 float a_[4];
2525 wasm_v128_store(a_, a.val);
2526 double c_[2];
2527 c_[0] = (double)(a_[2]);
2528 c_[1] = (double)(a_[3]);
2529 return v_float64x2(wasm_v128_load(c_));
2530}
2531
2532inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2533{
2534#ifdef __wasm_unimplemented_simd128__
2535 return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
2536#else
2537 int64 a_[2];
2538 wasm_v128_store(a_, a.val);
2539 double c_[2];
2540 c_[0] = (double)(a_[0]);
2541 c_[1] = (double)(a_[1]);
2542 return v_float64x2(wasm_v128_load(c_));
2543#endif
2544}
2545
2546////////////// Lookup table access ////////////////
2547
2548inline v_int8x16 v_lut(const schar* tab, const int* idx)
2549{
2550 return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
2551 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
2552}
2553inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
2554{
2555 return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
2556 tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]);
2557}
2558inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2559{
2560 return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
2561 tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]);
2562}
2563inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
2564inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
2565inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
2566
2567inline v_int16x8 v_lut(const short* tab, const int* idx)
2568{
2569 return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
2570 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
2571}
2572inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2573{
2574 return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
2575 tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]);
2576}
2577inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
2578{
2579 return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
2580 tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]);
2581}
2582inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
2583inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
2584inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
2585
2586inline v_int32x4 v_lut(const int* tab, const int* idx)
2587{
2588 return v_int32x4(tab[idx[0]], tab[idx[1]],
2589 tab[idx[2]], tab[idx[3]]);
2590}
2591inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
2592{
2593 return v_int32x4(tab[idx[0]], tab[idx[0]+1],
2594 tab[idx[1]], tab[idx[1]+1]);
2595}
2596inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
2597{
2598 return v_int32x4(wasm_v128_load(tab + idx[0]));
2599}
2600inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
2601inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
2602inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
2603
2604inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
2605{
2606 return v_int64x2(tab[idx[0]], tab[idx[1]]);
2607}
2608inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
2609{
2610 return v_int64x2(wasm_v128_load(tab + idx[0]));
2611}
2612inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
2613inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2614
2615inline v_float32x4 v_lut(const float* tab, const int* idx)
2616{
2617 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
2618}
2619inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
2620inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
2621
2622inline v_float64x2 v_lut(const double* tab, const int* idx)
2623{
2624 return v_float64x2(tab[idx[0]], tab[idx[1]]);
2625}
2626inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
2627{
2628 return v_float64x2(wasm_v128_load(tab + idx[0]));
2629}
2630
2631inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2632{
2633 return v_int32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2634 tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2635 tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2636 tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2637}
2638
2639inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2640{
2641 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
2642}
2643
2644inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2645{
2646 return v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2647 tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2648 tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2649 tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2650}
2651
2652inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
2653{
2654 return v_float64x2(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2655 tab[wasm_i32x4_extract_lane(idxvec.val, 1)]);
2656}
2657
2658// loads pairs from the table and deinterleaves them, e.g. returns:
2659// x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
2660// y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
2661// note that the indices are float's indices, not the float-pair indices.
2662// in theory, this function can be used to implement bilinear interpolation,
2663// when idxvec are the offsets within the image.
2664inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2665{
2666 x = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2667 tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2668 tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2669 tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2670 y = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)+1],
2671 tab[wasm_i32x4_extract_lane(idxvec.val, 1)+1],
2672 tab[wasm_i32x4_extract_lane(idxvec.val, 2)+1],
2673 tab[wasm_i32x4_extract_lane(idxvec.val, 3)+1]);
2674}
2675
2676inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
2677{
2678 v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
2679 v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
2680 x.val = wasm_unpacklo_i64x2(xy0, xy1);
2681 y.val = wasm_unpackhi_i64x2(xy0, xy1);
2682}
2683
2684inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2685{
2686 return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
2687}
2688inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
2689inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2690{
2691 return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
2692}
2693inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
2694
2695inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2696{
2697 return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
2698}
2699inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2700inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2701{
2702 return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
2703}
2704inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2705
2706inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2707{
2708 return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
2709}
2710inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2711inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
2712{
2713 return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
2714}
2715
2716inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2717{
2718 return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
2719}
2720inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2721
2722inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2723{
2724 return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
2725}
2726inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2727
2728inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2729inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2730inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2731
2732template<int i, typename _Tp>
2733inline typename _Tp::lane_type v_extract_n(const _Tp& a)
2734{
2735 return v_rotate_right<i>(a).get0();
2736}
2737
2738template<int i>
2739inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
2740{
2741 return v_setall_u32(v_extract_n<i>(a));
2742}
2743template<int i>
2744inline v_int32x4 v_broadcast_element(const v_int32x4& a)
2745{
2746 return v_setall_s32(v_extract_n<i>(a));
2747}
2748template<int i>
2749inline v_float32x4 v_broadcast_element(const v_float32x4& a)
2750{
2751 return v_setall_f32(v_extract_n<i>(a));
2752}
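// Sketch: v_broadcast_element<i> extracts lane i (a lane rotate followed by
// get0) and splats it across the whole register:
//   v_int32x4 v(10, 20, 30, 40);
//   v_broadcast_element<2>(v);  // {30, 30, 30, 30}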
2753
2754
2755////////////// FP16 support ///////////////////////////
2756
2757inline v_float32x4 v_load_expand(const hfloat* ptr)
2758{
2759 float a[4];
2760 for (int i = 0; i < 4; i++)
2761 a[i] = ptr[i];
2762 return v_float32x4(wasm_v128_load(a));
2763}
2764
2765inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2766{
2767 float v_[4];
2768 wasm_v128_store(v_, v.val);
2769 ptr[0] = hfloat(v_[0]);
2770 ptr[1] = hfloat(v_[1]);
2771 ptr[2] = hfloat(v_[2]);
2772 ptr[3] = hfloat(v_[3]);
2773}
2774
2775inline void v_cleanup() {}
2776
2777CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2778
2779//! @endcond
2780
2781}
2782
2783#endif
void v_store_interleave(_Tp *ptr, const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, hal::StoreMode=hal::STORE_UNALIGNED)
Interleave and store (2 channels)
Definition intrin_cpp.hpp:2115
void v_lut_deinterleave(const float *tab, const v_reg< int, n > &idx, v_reg< float, n > &x, v_reg< float, n > &y)
Definition intrin_cpp.hpp:2681
void v_transpose4x4(v_reg< _Tp, n > &a0, const v_reg< _Tp, n > &a1, const v_reg< _Tp, n > &a2, const v_reg< _Tp, n > &a3, v_reg< _Tp, n > &b0, v_reg< _Tp, n > &b1, v_reg< _Tp, n > &b2, v_reg< _Tp, n > &b3)
Transpose 4x4 matrix.
Definition intrin_cpp.hpp:2761
v_reg< _Tp, n > v_absdiffs(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Saturating absolute difference.
Definition intrin_cpp.hpp:994
v_reg< uint64, 2 > v_uint64x2
Two 64-bit unsigned integer values.
Definition intrin_cpp.hpp:505
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements.
Definition intrin_cpp.hpp:1116
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut(const _Tp *tab, const int *idx)
Definition intrin_cpp.hpp:2626
v_reg< _Tp, n > v_mul_hi(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Multiply and extract high part.
Definition intrin_cpp.hpp:1233
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut_quads(const _Tp *tab, const int *idx)
Definition intrin_cpp.hpp:2640
v_reg< float, 4 > v_float32x4
Four 32-bit floating point values (single precision)
Definition intrin_cpp.hpp:501
v_reg< float, n > v_cvt_f32(const v_reg< int, n > &a)
Convert to float.
Definition intrin_cpp.hpp:2534
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut_pairs(const _Tp *tab, const int *idx)
Definition intrin_cpp.hpp:2633
v_reg< float, n > v_matmuladd(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication and add.
Definition intrin_cpp.hpp:3223
_Tp v_extract_n(const v_reg< _Tp, n > &v)
Vector extract.
Definition intrin_cpp.hpp:2397
v_reg< float, n > v_not_nan(const v_reg< float, n > &a)
Less-than comparison.
Definition intrin_cpp.hpp:890
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_popcount(const v_reg< _Tp, n > &a)
Count the 1 bits in the vector lanes and return result as corresponding unsigned type.
Definition intrin_cpp.hpp:828
v_reg< short, 8 > v_int16x8
Eight 16-bit signed integer values.
Definition intrin_cpp.hpp:495
v_reg< double, 2 > v_float64x2
Two 64-bit floating point values (double precision)
Definition intrin_cpp.hpp:503
v_reg< _Tp, n > v_extract(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Vector extract.
Definition intrin_cpp.hpp:2371
void v_load_deinterleave(const _Tp *ptr, v_reg< _Tp, n > &a, v_reg< _Tp, n > &b)
Load and deinterleave (2 channels)
Definition intrin_cpp.hpp:2043
CV_INLINE int cvRound(double value)
Rounds floating-point number to the nearest integer.
Definition fast_math.hpp:200
CV_INLINE int cvCeil(double value)
Rounds floating-point number to the nearest integer not smaller than the original.
Definition fast_math.hpp:258
CV_INLINE int cvFloor(double value)
Rounds floating-point number to the nearest integer not larger than the original.
Definition fast_math.hpp:231
CvRect r
Definition imgproc_c.h:984
CvSize int int int CvPoint int delta
Definition imgproc_c.h:1168
CV_EXPORTS OutputArray int double double InputArray mask
Definition imgproc.hpp:2132
T max(T... args)
T min(T... args)
StoreMode
Definition intrin.hpp:100
@ STORE_UNALIGNED
Definition intrin.hpp:101
"black box" representation of the file storage associated with a file on disk.
Definition calib3d.hpp:441