EstervQrCode 1.1.1
Library for QR code manipulation
intrin_wasm.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4 
5 #ifndef OPENCV_HAL_INTRIN_WASM_HPP
6 #define OPENCV_HAL_INTRIN_WASM_HPP
7 
8 #include <limits>
9 #include <cstring>
10 #include <algorithm>
11 #include <emscripten/version.h>
12 #include "opencv2/core/saturate.hpp"
13 
14 #define CV_SIMD128 1
15 #define CV_SIMD128_64F 0 // All f64 operations currently fall back to scalar code, so this is disabled.
16 #define CV_SIMD128_FP16 0
17 
18 namespace cv
19 {
20 
22 
23 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
24 
25 #if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046)
26 // handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673)
27 #define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4
28 #define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4
29 #define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2
30 #define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2
31 #define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4
32 #define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4
33 #define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2
34 #define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2
35 #endif // COMPATIBILITY: <1.38.46
36 
38 
39 struct v_uint8x16
40 {
41  typedef uchar lane_type;
42  typedef v128_t vector_type;
43  enum { nlanes = 16 };
44 
45  v_uint8x16() {}
46  explicit v_uint8x16(v128_t v) : val(v) {}
47  v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
48  uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
49  {
50  uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
51  val = wasm_v128_load(v);
52  }
53 
54  uchar get0() const
55  {
56  return (uchar)wasm_i8x16_extract_lane(val, 0);
57  }
58 
59  v128_t val;
60 };
61 
62 struct v_int8x16
63 {
64  typedef schar lane_type;
65  typedef v128_t vector_type;
66  enum { nlanes = 16 };
67 
68  v_int8x16() {}
69  explicit v_int8x16(v128_t v) : val(v) {}
70  v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
71  schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
72  {
73  schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
74  val = wasm_v128_load(v);
75  }
76 
77  schar get0() const
78  {
79  return wasm_i8x16_extract_lane(val, 0);
80  }
81 
82  v128_t val;
83 };
84 
85 struct v_uint16x8
86 {
87  typedef ushort lane_type;
88  typedef v128_t vector_type;
89  enum { nlanes = 8 };
90 
91  v_uint16x8() {}
92  explicit v_uint16x8(v128_t v) : val(v) {}
93  v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
94  {
95  ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
96  val = wasm_v128_load(v);
97  }
98 
99  ushort get0() const
100  {
101  return (ushort)wasm_i16x8_extract_lane(val, 0); // wasm_u16x8_extract_lane() is not implemented yet
102  }
103 
104  v128_t val;
105 };
106 
107 struct v_int16x8
108 {
109  typedef short lane_type;
110  typedef v128_t vector_type;
111  enum { nlanes = 8 };
112 
113  v_int16x8() {}
114  explicit v_int16x8(v128_t v) : val(v) {}
115  v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
116  {
117  short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
118  val = wasm_v128_load(v);
119  }
120 
121  short get0() const
122  {
123  return wasm_i16x8_extract_lane(val, 0);
124  }
125 
126  v128_t val;
127 };
128 
129 struct v_uint32x4
130 {
131  typedef unsigned lane_type;
132  typedef v128_t vector_type;
133  enum { nlanes = 4 };
134 
135  v_uint32x4() {}
136  explicit v_uint32x4(v128_t v) : val(v) {}
137  v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
138  {
139  unsigned v[] = {v0, v1, v2, v3};
140  val = wasm_v128_load(v);
141  }
142 
143  unsigned get0() const
144  {
145  return (unsigned)wasm_i32x4_extract_lane(val, 0);
146  }
147 
148  v128_t val;
149 };
150 
151 struct v_int32x4
152 {
153  typedef int lane_type;
154  typedef v128_t vector_type;
155  enum { nlanes = 4 };
156 
157  v_int32x4() {}
158  explicit v_int32x4(v128_t v) : val(v) {}
159  v_int32x4(int v0, int v1, int v2, int v3)
160  {
161  int v[] = {v0, v1, v2, v3};
162  val = wasm_v128_load(v);
163  }
164 
165  int get0() const
166  {
167  return wasm_i32x4_extract_lane(val, 0);
168  }
169 
170  v128_t val;
171 };
172 
173 struct v_float32x4
174 {
175  typedef float lane_type;
176  typedef v128_t vector_type;
177  enum { nlanes = 4 };
178 
179  v_float32x4() {}
180  explicit v_float32x4(v128_t v) : val(v) {}
181  v_float32x4(float v0, float v1, float v2, float v3)
182  {
183  float v[] = {v0, v1, v2, v3};
184  val = wasm_v128_load(v);
185  }
186 
187  float get0() const
188  {
189  return wasm_f32x4_extract_lane(val, 0);
190  }
191 
192  v128_t val;
193 };
194 
195 struct v_uint64x2
196 {
197  typedef uint64 lane_type;
198  typedef v128_t vector_type;
199  enum { nlanes = 2 };
200 
201  v_uint64x2() {}
202  explicit v_uint64x2(v128_t v) : val(v) {}
203  v_uint64x2(uint64 v0, uint64 v1)
204  {
205  uint64 v[] = {v0, v1};
206  val = wasm_v128_load(v);
207  }
208 
209  uint64 get0() const
210  {
211  return (uint64)wasm_i64x2_extract_lane(val, 0);
212  }
213 
214  v128_t val;
215 };
216 
217 struct v_int64x2
218 {
219  typedef int64 lane_type;
220  typedef v128_t vector_type;
221  enum { nlanes = 2 };
222 
223  v_int64x2() {}
224  explicit v_int64x2(v128_t v) : val(v) {}
225  v_int64x2(int64 v0, int64 v1)
226  {
227  int64 v[] = {v0, v1};
228  val = wasm_v128_load(v);
229  }
230 
231  int64 get0() const
232  {
233  return wasm_i64x2_extract_lane(val, 0);
234  }
235 
236  v128_t val;
237 };
238 
239 struct v_float64x2
240 {
241  typedef double lane_type;
242  typedef v128_t vector_type;
243  enum { nlanes = 2 };
244 
245  v_float64x2() {}
246  explicit v_float64x2(v128_t v) : val(v) {}
247  v_float64x2(double v0, double v1)
248  {
249  double v[] = {v0, v1};
250  val = wasm_v128_load(v);
251  }
252 
253  double get0() const
254  {
255  return wasm_f64x2_extract_lane(val, 0);
256  }
257 
258  v128_t val;
259 };
260 
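The structs above are thin wrappers around a single 128-bit WASM register. A minimal usage sketch (illustrative only, not part of the header, assuming it is reached through OpenCV's universal-intrinsics front-end):

    v_float32x4 v(1.0f, 2.0f, 3.0f, 4.0f);   // four lanes stored via wasm_v128_load
    float first = v.get0();                  // read lane 0 -> 1.0f
    v_int32x4 w(wasm_i32x4_splat(7));        // explicit construction from a raw v128_t
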
261 namespace
262 {
263 #define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
264 inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
265 OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
266 OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
267 OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
268 OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
269 OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
270 OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
271 OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
272 OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
273 OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
274 OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
275 
276 static const unsigned char popCountTable[] =
277 {
278  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
279  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
280  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
281  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
282  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
283  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
284  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
285  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
286  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
287  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
288  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
289  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
290  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
291  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
292  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
293  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
294 };
295 } // namespace
296 
297 static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
298  return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
299 }
300 
301 static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) {
302  return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23);
303 }
304 
305 static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) {
306  return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23);
307 }
308 
309 static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) {
310  return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
311 }
312 
313 static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) {
314  return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31);
315 }
316 
317 static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) {
318  return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
319 }
320 
321 static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) {
322  return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
323 }
324 
325 static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) {
326  return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
327 }
328 
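The wasm_unpacklo_/wasm_unpackhi_ helpers above emulate SSE-style interleaves with byte shuffles. A small illustration of the lane order they produce (not part of the header):

    uchar lo[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
    uchar hi[16] = {16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
    v128_t r = wasm_unpacklo_i8x16(wasm_v128_load(lo), wasm_v128_load(hi));
    // bytes of r: 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23 (low halves interleaved)
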
330 // 8 >> 16
331 inline v128_t v128_cvtu8x16_i16x8(const v128_t& a)
332 {
333  const v128_t z = wasm_i8x16_splat(0);
334  return wasm_unpacklo_i8x16(a, z);
335 }
336 inline v128_t v128_cvti8x16_i16x8(const v128_t& a)
337 { return wasm_i16x8_shr(wasm_unpacklo_i8x16(a, a), 8); }
338 // 8 >> 32
339 inline v128_t v128_cvtu8x16_i32x4(const v128_t& a)
340 {
341  const v128_t z = wasm_i8x16_splat(0);
342  return wasm_unpacklo_i16x8(wasm_unpacklo_i8x16(a, z), z);
343 }
344 inline v128_t v128_cvti8x16_i32x4(const v128_t& a)
345 {
346  v128_t r = wasm_unpacklo_i8x16(a, a);
347  r = wasm_unpacklo_i8x16(r, r);
348  return wasm_i32x4_shr(r, 24);
349 }
350 // 16 >> 32
351 inline v128_t v128_cvtu16x8_i32x4(const v128_t& a)
352 {
353  const v128_t z = wasm_i8x16_splat(0);
354  return wasm_unpacklo_i16x8(a, z);
355 }
356 inline v128_t v128_cvti16x8_i32x4(const v128_t& a)
357 { return wasm_i32x4_shr(wasm_unpacklo_i16x8(a, a), 16); }
358 // 32 >> 64
359 inline v128_t v128_cvtu32x4_i64x2(const v128_t& a)
360 {
361  const v128_t z = wasm_i8x16_splat(0);
362  return wasm_unpacklo_i32x4(a, z);
363 }
364 inline v128_t v128_cvti32x4_i64x2(const v128_t& a)
365 { return wasm_unpacklo_i32x4(a, wasm_i32x4_shr(a, 31)); }
366 
367 // 16 << 8
368 inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a)
369 {
370  const v128_t z = wasm_i8x16_splat(0);
371  return wasm_unpackhi_i8x16(a, z);
372 }
373 inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a)
374 { return wasm_i16x8_shr(wasm_unpackhi_i8x16(a, a), 8); }
375 // 32 << 16
376 inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a)
377 {
378  const v128_t z = wasm_i8x16_splat(0);
379  return wasm_unpackhi_i16x8(a, z);
380 }
381 inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a)
382 { return wasm_i32x4_shr(wasm_unpackhi_i16x8(a, a), 16); }
383 // 64 << 32
384 inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a)
385 {
386  const v128_t z = wasm_i8x16_splat(0);
387  return wasm_unpackhi_i32x4(a, z);
388 }
389 inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a)
390 { return wasm_unpackhi_i32x4(a, wasm_i32x4_shr(a, 31)); }
391 
392 #define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \
393 inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \
394 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \
395 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
396 { return _Tpvec(a.val); }
397 
398 OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar)
399 OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar)
400 OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short)
401 OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short)
402 OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int)
403 OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int)
404 OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float)
405 OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64)
406 OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64)
407 OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double)
408 
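A short sketch of the initialization helpers generated by the macro above (illustrative, not part of the header):

    v_uint8x16  zeros = v_setzero_u8();            // all 16 lanes = 0
    v_float32x4 pi    = v_setall_f32(3.14f);       // all 4 lanes = 3.14f
    v_int32x4   bits  = v_reinterpret_as_s32(pi);  // same 128 bits, no value conversion
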
409 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
411 {
412  v128_t maxval = wasm_i16x8_splat(255);
413  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
414  v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
415  return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
416 }
417 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
418 {
419  v128_t maxval = wasm_i16x8_splat(127);
420  v128_t minval = wasm_i16x8_splat(-128);
421  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
422  v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
423  v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
424  v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
425  return v_int8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
426 }
427 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
428 {
429  v128_t maxval = wasm_i32x4_splat(65535);
430  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
431  v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
432  return v_uint16x8(wasm_v8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
433 }
434 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
435 {
436  v128_t maxval = wasm_i32x4_splat(32767);
437  v128_t minval = wasm_i32x4_splat(-32768);
438  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
439  v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
440  v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
441  v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
442  return v_int16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
443 }
444 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
445 {
446  return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
447 }
448 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
449 {
450  return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
451 }
452 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
453 {
454  v128_t maxval = wasm_i16x8_splat(255);
455  v128_t minval = wasm_i16x8_splat(0);
456  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
457  v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
458  v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
459  v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
460  return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
461 }
462 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
463 {
464  v128_t maxval = wasm_i32x4_splat(65535);
465  v128_t minval = wasm_i32x4_splat(0);
466  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
467  v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
468  v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
469  v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
470  return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
471 }
472 
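v_pack and v_pack_u narrow two input vectors into one vector of half-width lanes with saturation. A sketch of the clamping behaviour (not part of the header):

    v_int16x8 a = v_setall_s16(300), b = v_setall_s16(-5);
    v_uint8x16 u = v_pack_u(a, b);  // lanes 0..7 = 255 (clamped), lanes 8..15 = 0 (clamped)
    v_int8x16  s = v_pack(a, b);    // lanes 0..7 = 127 (clamped), lanes 8..15 = -5
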
473 template<int n>
474 inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
475 {
476  v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
477  v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
478  v128_t b1 = wasm_u16x8_shr(wasm_i16x8_add(b.val, delta), n);
479  v128_t maxval = wasm_i16x8_splat(255);
480  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
481  v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval));
482  return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
483 }
484 template<int n>
485 inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
486 {
487  v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
488  v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
489  v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
490  v128_t maxval = wasm_i16x8_splat(127);
491  v128_t minval = wasm_i16x8_splat(-128);
492  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
493  v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
494  v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
495  v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
496  return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
497 }
498 template<int n>
499 inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
500 {
501  v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
502  v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
503  v128_t b1 = wasm_u32x4_shr(wasm_i32x4_add(b.val, delta), n);
504  v128_t maxval = wasm_i32x4_splat(65535);
505  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
506  v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u32x4_gt(b1, maxval));
507  return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
508 }
509 template<int n>
510 inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
511 {
512  v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
513  v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
514  v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
515  v128_t maxval = wasm_i32x4_splat(32767);
516  v128_t minval = wasm_i32x4_splat(-32768);
517  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
518  v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
519  v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
520  v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
521  return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
522 }
523 template<int n>
524 inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
525 {
526  v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
527  v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
528  v128_t b1 = wasm_u64x2_shr(wasm_i64x2_add(b.val, delta), n);
529  return v_uint32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
530 }
531 template<int n>
532 inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
533 {
534  v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
535  v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
536  v128_t b1 = wasm_i64x2_shr(wasm_i64x2_add(b.val, delta), n);
537  return v_int32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
538 }
539 template<int n>
540 inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
541 {
542  v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
543  v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
544  v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
545  v128_t maxval = wasm_i16x8_splat(255);
546  v128_t minval = wasm_i16x8_splat(0);
547  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
548  v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
549  v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
550  v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
551  return v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
552 }
553 template<int n>
554 inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
555 {
556  v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
557  v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
558  v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
559  v128_t maxval = wasm_i32x4_splat(65535);
560  v128_t minval = wasm_i32x4_splat(0);
561  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
562  v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
563  v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
564  v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
565  return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
566 }
567 
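The v_rshr_pack* family adds a rounding bias of 1 << (n-1) before shifting right by n and saturating, i.e. it rounds half up. A sketch (not part of the header):

    v_uint16x8 a = v_setall_u16(7), b = v_setall_u16(1000);
    v_uint8x16 r = v_rshr_pack<2>(a, b);  // (7+2)>>2 = 2 in lanes 0..7, (1000+2)>>2 = 250 in lanes 8..15
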
568 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
569 {
570  v128_t maxval = wasm_i16x8_splat(255);
571  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
572  v128_t r = wasm_v8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
573  uchar t_ptr[16];
574  wasm_v128_store(t_ptr, r);
575  for (int i=0; i<8; ++i) {
576  ptr[i] = t_ptr[i];
577  }
578 }
579 inline void v_pack_store(schar* ptr, const v_int16x8& a)
580 {
581  v128_t maxval = wasm_i16x8_splat(127);
582  v128_t minval = wasm_i16x8_splat(-128);
583  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
584  v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
585  v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
586  schar t_ptr[16];
587  wasm_v128_store(t_ptr, r);
588  for (int i=0; i<8; ++i) {
589  ptr[i] = t_ptr[i];
590  }
591 }
592 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
593 {
594  v128_t maxval = wasm_i32x4_splat(65535);
595  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
596  v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
597  ushort t_ptr[8];
598  wasm_v128_store(t_ptr, r);
599  for (int i=0; i<4; ++i) {
600  ptr[i] = t_ptr[i];
601  }
602 }
603 inline void v_pack_store(short* ptr, const v_int32x4& a)
604 {
605  v128_t maxval = wasm_i32x4_splat(32767);
606  v128_t minval = wasm_i32x4_splat(-32768);
607  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
608  v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
609  v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
610  short t_ptr[8];
611  wasm_v128_store(t_ptr, r);
612  for (int i=0; i<4; ++i) {
613  ptr[i] = t_ptr[i];
614  }
615 }
616 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
617 {
618  v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
619  unsigned t_ptr[4];
620  wasm_v128_store(t_ptr, r);
621  for (int i=0; i<2; ++i) {
622  ptr[i] = t_ptr[i];
623  }
624 }
625 inline void v_pack_store(int* ptr, const v_int64x2& a)
626 {
627  v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
628  int t_ptr[4];
629  wasm_v128_store(t_ptr, r);
630  for (int i=0; i<2; ++i) {
631  ptr[i] = t_ptr[i];
632  }
633 }
634 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
635 {
636  v128_t maxval = wasm_i16x8_splat(255);
637  v128_t minval = wasm_i16x8_splat(0);
638  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
639  v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
640  v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
641  uchar t_ptr[16];
642  wasm_v128_store(t_ptr, r);
643  for (int i=0; i<8; ++i) {
644  ptr[i] = t_ptr[i];
645  }
646 }
647 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
648 {
649  v128_t maxval = wasm_i32x4_splat(65535);
650  v128_t minval = wasm_i32x4_splat(0);
651  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
652  v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
653  v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
654  ushort t_ptr[8];
655  wasm_v128_store(t_ptr, r);
656  for (int i=0; i<4; ++i) {
657  ptr[i] = t_ptr[i];
658  }
659 }
660 
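v_pack_store and v_pack_u_store narrow with saturation and write only the resulting half-width lanes. A sketch (not part of the header):

    ushort buf[8] = {0};
    v_uint32x4 v = v_setall_u32(70000);
    v_pack_store(buf, v);   // buf[0..3] = 65535 (clamped), buf[4..7] left untouched
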
661 template<int n>
662 inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
663 {
664  v128_t delta = wasm_i16x8_splat((short)(1 << (n-1)));
665  v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
666  v128_t maxval = wasm_i16x8_splat(255);
667  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
668  v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
669  uchar t_ptr[16];
670  wasm_v128_store(t_ptr, r);
671  for (int i=0; i<8; ++i) {
672  ptr[i] = t_ptr[i];
673  }
674 }
675 template<int n>
676 inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
677 {
678  v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
679  v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
680  v128_t maxval = wasm_i16x8_splat(127);
681  v128_t minval = wasm_i16x8_splat(-128);
682  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
683  v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
684  v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
685  schar t_ptr[16];
686  wasm_v128_store(t_ptr, r);
687  for (int i=0; i<8; ++i) {
688  ptr[i] = t_ptr[i];
689  }
690 }
691 template<int n>
692 inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
693 {
694  v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
695  v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
696  v128_t maxval = wasm_i32x4_splat(65535);
697  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
698  v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
699  ushort t_ptr[8];
700  wasm_v128_store(t_ptr, r);
701  for (int i=0; i<4; ++i) {
702  ptr[i] = t_ptr[i];
703  }
704 }
705 template<int n>
706 inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)
707 {
708  v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
709  v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
710  v128_t maxval = wasm_i32x4_splat(32767);
711  v128_t minval = wasm_i32x4_splat(-32768);
712  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
713  v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
714  v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
715  short t_ptr[8];
716  wasm_v128_store(t_ptr, r);
717  for (int i=0; i<4; ++i) {
718  ptr[i] = t_ptr[i];
719  }
720 }
721 template<int n>
722 inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
723 {
724  v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
725  v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
726  v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
727  unsigned t_ptr[4];
728  wasm_v128_store(t_ptr, r);
729  for (int i=0; i<2; ++i) {
730  ptr[i] = t_ptr[i];
731  }
732 }
733 template<int n>
734 inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)
735 {
736  v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
737  v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
738  v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
739  int t_ptr[4];
740  wasm_v128_store(t_ptr, r);
741  for (int i=0; i<2; ++i) {
742  ptr[i] = t_ptr[i];
743  }
744 }
745 template<int n>
746 inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
747 {
748  v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
749  v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
750  v128_t maxval = wasm_i16x8_splat(255);
751  v128_t minval = wasm_i16x8_splat(0);
752  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
753  v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
754  v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
755  uchar t_ptr[16];
756  wasm_v128_store(t_ptr, r);
757  for (int i=0; i<8; ++i) {
758  ptr[i] = t_ptr[i];
759  }
760 }
761 template<int n>
762 inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
763 {
764  v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
765  v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
766  v128_t maxval = wasm_i32x4_splat(65535);
767  v128_t minval = wasm_i32x4_splat(0);
768  v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
769  v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
770  v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
771  ushort t_ptr[8];
772  wasm_v128_store(t_ptr, r);
773  for (int i=0; i<4; ++i) {
774  ptr[i] = t_ptr[i];
775  }
776 }
777 
778 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
779 {
780  v128_t maxval = wasm_i16x8_splat(255);
781  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
782  v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
783  return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
784 }
785 
786 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
787  const v_uint32x4& c, const v_uint32x4& d)
788 {
789  v128_t maxval = wasm_i32x4_splat(255);
790  v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
791  v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
792  v128_t c1 = wasm_v128_bitselect(maxval, c.val, wasm_u32x4_gt(c.val, maxval));
793  v128_t d1 = wasm_v128_bitselect(maxval, d.val, wasm_u32x4_gt(d.val, maxval));
794  v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
795  v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
796  return v_uint8x16(wasm_v8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
797 }
798 
799 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
800  const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
801  const v_uint64x2& g, const v_uint64x2& h)
802 {
803  v128_t maxval = wasm_i32x4_splat(255);
804  v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval));
805  v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval));
806  v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval));
807  v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval));
808  v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval));
809  v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval));
810  v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval));
811  v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval));
812  v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
813  v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
814  v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
815  v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
816  v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
817  v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
818  return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
819 }
820 
821 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
822  const v_float32x4& m1, const v_float32x4& m2,
823  const v_float32x4& m3)
824 {
825  v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
826  v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
827  v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
828  v128_t v3 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3));
829  v0 = wasm_f32x4_mul(v0, m0.val);
830  v1 = wasm_f32x4_mul(v1, m1.val);
831  v2 = wasm_f32x4_mul(v2, m2.val);
832  v3 = wasm_f32x4_mul(v3, m3.val);
833 
834  return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, v3)));
835 }
836 
837 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
838  const v_float32x4& m1, const v_float32x4& m2,
839  const v_float32x4& a)
840 {
841  v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
842  v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
843  v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
844  v0 = wasm_f32x4_mul(v0, m0.val);
845  v1 = wasm_f32x4_mul(v1, m1.val);
846  v2 = wasm_f32x4_mul(v2, m2.val);
847 
848  return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, a.val)));
849 }
850 
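v_matmul computes v0*m0 + v1*m1 + v2*m2 + v3*m3 (a 4x4 matrix stored as rows, applied to the vector v), and v_matmuladd replaces the last term with an additive vector. An illustrative sketch (not part of the header):

    v_float32x4 m0(1.f,0.f,0.f,0.f), m1(0.f,1.f,0.f,0.f), m2(0.f,0.f,1.f,0.f), m3(0.f,0.f,0.f,1.f);
    v_float32x4 v(2.f, 3.f, 4.f, 5.f);
    v_float32x4 r = v_matmul(v, m0, m1, m2, m3);    // r = {2, 3, 4, 5}
    v_float32x4 t = v_matmuladd(v, m0, m1, m2, m3); // t = {2, 3, 4, 1}: v3 is ignored, m3 is added
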
851 #define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \
852 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
853 { \
854  return _Tpvec(intrin(a.val, b.val)); \
855 } \
856 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
857 { \
858  a.val = intrin(a.val, b.val); \
859  return a; \
860 }
861 
862 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate)
863 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate)
864 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate)
865 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate)
866 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate)
867 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate)
868 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate)
869 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate)
870 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add)
871 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub)
872 OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul)
873 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add)
874 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub)
875 OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul)
876 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add)
877 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub)
878 OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul)
879 OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div)
880 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add)
881 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub)
882 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add)
883 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub)
884 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add)
885 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub)
886 OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul)
887 OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div)
888 
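Note that the 8- and 16-bit operator+/- above map to the saturating WASM intrinsics, while the wrapping variants are provided separately as v_add_wrap/v_sub_wrap further below. A sketch of the difference (not part of the header):

    v_uint8x16 a = v_setall_u8(250), b = v_setall_u8(10);
    v_uint8x16 s = a + b;             // saturating: every lane = 255
    v_uint8x16 w = v_add_wrap(a, b);  // wrapping:   every lane = 4
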
889 // saturating multiply 8-bit, 16-bit
890 #define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \
891 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
892 { \
893  _Tpwvec c, d; \
894  v_mul_expand(a, b, c, d); \
895  return v_pack(c, d); \
896 } \
897 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
898 { a = a * b; return a; }
899 
900 OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8)
901 OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8)
902 OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint16x8, v_uint32x4)
903 OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int16x8, v_int32x4)
904 
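For 8- and 16-bit types, operator* is implemented by expanding to double-width lanes, multiplying, and re-packing with saturation. A sketch (not part of the header):

    v_uint8x16 a = v_setall_u8(100), b = v_setall_u8(3);
    v_uint8x16 r = a * b;   // 100*3 = 300 saturates to 255 in every lane (not 300 & 0xff)
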
905 // Multiply and expand
906 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
907  v_uint16x8& c, v_uint16x8& d)
908 {
909  v_uint16x8 a0, a1, b0, b1;
910  v_expand(a, a0, a1);
911  v_expand(b, b0, b1);
912  c = v_mul_wrap(a0, b0);
913  d = v_mul_wrap(a1, b1);
914 }
915 
916 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
917  v_int16x8& c, v_int16x8& d)
918 {
919  v_int16x8 a0, a1, b0, b1;
920  v_expand(a, a0, a1);
921  v_expand(b, b0, b1);
922  c = v_mul_wrap(a0, b0);
923  d = v_mul_wrap(a1, b1);
924 }
925 
926 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
927  v_int32x4& c, v_int32x4& d)
928 {
929  v_int32x4 a0, a1, b0, b1;
930  v_expand(a, a0, a1);
931  v_expand(b, b0, b1);
932  c.val = wasm_i32x4_mul(a0.val, b0.val);
933  d.val = wasm_i32x4_mul(a1.val, b1.val);
934 }
935 
936 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
937  v_uint32x4& c, v_uint32x4& d)
938 {
939  v_uint32x4 a0, a1, b0, b1;
940  v_expand(a, a0, a1);
941  v_expand(b, b0, b1);
942  c.val = wasm_i32x4_mul(a0.val, b0.val);
943  d.val = wasm_i32x4_mul(a1.val, b1.val);
944 }
945 
946 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
947  v_uint64x2& c, v_uint64x2& d)
948 {
949  v_uint64x2 a0, a1, b0, b1;
950  v_expand(a, a0, a1);
951  v_expand(b, b0, b1);
952  c.val = ((__u64x2)(a0.val) * (__u64x2)(b0.val));
953  d.val = ((__u64x2)(a1.val) * (__u64x2)(b1.val));
954 }
955 
956 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
957 {
958  v_int32x4 a0, a1, b0, b1;
959  v_expand(a, a0, a1);
960  v_expand(b, b0, b1);
961  v128_t c = wasm_i32x4_mul(a0.val, b0.val);
962  v128_t d = wasm_i32x4_mul(a1.val, b1.val);
963  return v_int16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
964 }
965 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
966 {
967  v_uint32x4 a0, a1, b0, b1;
968  v_expand(a, a0, a1);
969  v_expand(b, b0, b1);
970  v128_t c = wasm_i32x4_mul(a0.val, b0.val);
971  v128_t d = wasm_i32x4_mul(a1.val, b1.val);
972  return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
973 }
974 
976 
977 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
978 {
979  v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
980  v128_t a1 = wasm_i32x4_shr(a.val, 16);
981  v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
982  v128_t b1 = wasm_i32x4_shr(b.val, 16);
983  v128_t c = wasm_i32x4_mul(a0, b0);
984  v128_t d = wasm_i32x4_mul(a1, b1);
985  return v_int32x4(wasm_i32x4_add(c, d));
986 }
987 
988 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
989 { return v_dotprod(a, b) + c; }
990 
991 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
992 {
993  v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
994  v128_t a1 = wasm_i64x2_shr(a.val, 32);
995  v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
996  v128_t b1 = wasm_i64x2_shr(b.val, 32);
997  v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
998  v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);
999  return v_int64x2(wasm_i64x2_add(c, d));
1000 }
1001 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1002 {
1003  return v_dotprod(a, b) + c;
1004 }
1005 
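v_dotprod multiplies adjacent lane pairs and sums each pair into one double-width lane. A sketch (not part of the header):

    v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
    v_int16x8 b = v_setall_s16(1);
    v_int32x4 r = v_dotprod(a, b);   // r = {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15}
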
1006 // 8 >> 32
1007 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
1008 {
1009  v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
1010  v128_t a1 = wasm_u16x8_shr(a.val, 8);
1011  v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
1012  v128_t b1 = wasm_u16x8_shr(b.val, 8);
1013  return v_uint32x4((
1014  v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
1015  v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
1016  );
1017 }
1018 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1019 { return v_dotprod_expand(a, b) + c; }
1020 
1021 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
1022 {
1023  v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
1024  v128_t a1 = wasm_i16x8_shr(a.val, 8);
1025  v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
1026  v128_t b1 = wasm_i16x8_shr(b.val, 8);
1027  return v_int32x4(
1028  v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
1029  v_dotprod(v_int16x8(a1), v_int16x8(b1))
1030  );
1031 }
1032 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1033 { return v_dotprod_expand(a, b) + c; }
1034 
1035 // 16 >> 64
1036 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
1037 {
1038  v128_t a0 = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
1039  v128_t a1 = wasm_u32x4_shr(a.val, 16);
1040  v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
1041  v128_t b1 = wasm_u32x4_shr(b.val, 16);
1042  return v_uint64x2((
1043  v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
1044  v_dotprod(v_int32x4(a1), v_int32x4(b1))).val
1045  );
1046 }
1047 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1048 { return v_dotprod_expand(a, b) + c; }
1049 
1050 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
1051 {
1052  v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
1053  v128_t a1 = wasm_i32x4_shr(a.val, 16);
1054  v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
1055  v128_t b1 = wasm_i32x4_shr(b.val, 16);
1056  return v_int64x2((
1057  v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
1058  v_dotprod(v_int32x4(a1), v_int32x4(b1)))
1059  );
1060 }
1061 
1062 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1063 { return v_dotprod_expand(a, b) + c; }
1064 
1065 // 32 >> 64f
1066 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
1067 { return v_cvt_f64(v_dotprod(a, b)); }
1068 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1069 { return v_dotprod_expand(a, b) + c; }
1070 
1072 
1073 // 16 >> 32
1074 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
1075 { return v_dotprod(a, b); }
1076 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1077 { return v_dotprod(a, b, c); }
1078 
1079 // 32 >> 64
1080 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
1081 { return v_dotprod(a, b); }
1082 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1083 { return v_dotprod(a, b, c); }
1084 
1085 // 8 >> 32
1086 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
1087 { return v_dotprod_expand(a, b); }
1088 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1089 { return v_dotprod_expand(a, b, c); }
1090 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
1091 { return v_dotprod_expand(a, b); }
1092 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1093 { return v_dotprod_expand(a, b, c); }
1094 
1095 // 16 >> 64
1096 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1097 { return v_dotprod_expand(a, b); }
1098 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1099 { return v_dotprod_expand(a, b, c); }
1100 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1101 { return v_dotprod_expand(a, b); }
1102 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1103 { return v_dotprod_expand(a, b, c); }
1104 
1105 // 32 >> 64f
1106 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1107 { return v_dotprod_expand(a, b); }
1108 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1109 { return v_dotprod_expand(a, b, c); }
1110 
1111 #define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
1112 OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
1113 OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \
1114 OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \
1115 inline _Tpvec operator ~ (const _Tpvec& a) \
1116 { \
1117  return _Tpvec(wasm_v128_not(a.val)); \
1118 }
1119 
1120 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint8x16)
1121 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16)
1122 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint16x8)
1123 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int16x8)
1124 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint32x4)
1125 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int32x4)
1126 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint64x2)
1127 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int64x2)
1128 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float32x4)
1129 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float64x2)
1130 
1131 inline v_float32x4 v_sqrt(const v_float32x4& x)
1132 {
1133  return v_float32x4(wasm_f32x4_sqrt(x.val));
1134 }
1135 
1136 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1137 {
1138  const v128_t _1_0 = wasm_f32x4_splat(1.0);
1139  return v_float32x4(wasm_f32x4_div(_1_0, wasm_f32x4_sqrt(x.val)));
1140 }
1141 
1142 inline v_float64x2 v_sqrt(const v_float64x2& x)
1143 {
1144  return v_float64x2(wasm_f64x2_sqrt(x.val));
1145 }
1146 
1147 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1148 {
1149  const v128_t _1_0 = wasm_f64x2_splat(1.0);
1150  return v_float64x2(wasm_f64x2_div(_1_0, wasm_f64x2_sqrt(x.val)));
1151 }
1152 
1153 #define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) \
1154 inline _Tpuvec v_abs(const _Tpsvec& x) \
1155 { \
1156  v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); \
1157  v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); \
1158  return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); \
1159 }
1160 
1161 OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint8x16, v_int8x16, u8x16, i8x16, 7)
1162 OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint16x8, v_int16x8, u16x8, i16x8, 15)
1163 OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint32x4, v_int32x4, u32x4, i32x4, 31)
1164 
1165 inline v_float32x4 v_abs(const v_float32x4& x)
1166 { return v_float32x4(wasm_f32x4_abs(x.val)); }
1167 inline v_float64x2 v_abs(const v_float64x2& x)
1168 {
1169  return v_float64x2(wasm_f64x2_abs(x.val));
1170 }
1171 
1172 // TODO: exp, log, sin, cos
1173 
1174 #define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) \
1175 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1176 { \
1177  return _Tpvec(intrin(a.val, b.val)); \
1178 }
1179 
1180 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min)
1181 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
1182 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
1183 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max)
1184 
1185 #define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) \
1186 inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
1187 { \
1188  return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); \
1189 } \
1190 inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
1191 { \
1192  return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); \
1193 }
1194 
1195 OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16)
1196 OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8)
1197 OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4)
1198 
1199 #define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) \
1200 inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
1201 { \
1202  v128_t delta = wasm_##suffix##_splat(deltaNum); \
1203  v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
1204  return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); \
1205 } \
1206 inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
1207 { \
1208  v128_t delta = wasm_##suffix##_splat(deltaNum); \
1209  v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
1210  return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); \
1211 }
1212 
1213 OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80)
1214 OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000)
1215 OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000)
1216 
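The unsigned variants above emulate an unsigned comparison by XOR-ing both operands with the sign bit and using the signed greater-than. A sketch of why this matters (not part of the header):

    v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(50);
    v_uint8x16 m = v_min(a, b);   // = 50; a plain signed compare would treat 200 as -56 and pick it instead
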
1217 #define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \
1218 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1219 { return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \
1220 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1221 { return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \
1222 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1223 { return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \
1224 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1225 { return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \
1226 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1227 { return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \
1228 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1229 { return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); }
1230 
1231 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16)
1232 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16)
1233 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8)
1234 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8)
1235 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4)
1236 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4)
1237 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4)
1238 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2)
1239 
1240 #define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \
1241 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1242 { return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
1243 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1244 { return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
1245 
1246 OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
1247 OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
1248 
1249 inline v_float32x4 v_not_nan(const v_float32x4& a)
1250 {
1251  v128_t z = wasm_i32x4_splat(0x7fffffff);
1252  v128_t t = wasm_i32x4_splat(0x7f800000);
1253  return v_float32x4(wasm_u32x4_lt(wasm_v128_and(a.val, z), t));
1254 }
1255 inline v_float64x2 v_not_nan(const v_float64x2& a)
1256 {
1257  v128_t z = wasm_i64x2_splat(0x7fffffffffffffff);
1258  v128_t t = wasm_i64x2_splat(0x7ff0000000000000);
1259  return v_float64x2((__u64x2)(wasm_v128_and(a.val, z)) < (__u64x2)t);
1260 }
1261 
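v_not_nan yields an all-ones lane mask where the value is not NaN (the masked absolute value is compared against the exponent-all-ones pattern). A sketch (not part of the header):

    v_float32x4 x(1.0f, std::numeric_limits<float>::quiet_NaN(), 0.0f, -2.0f);
    v_float32x4 mask = v_not_nan(x);   // lanes: {all-ones, 0, all-ones, all-ones}
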
1262 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add)
1263 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add)
1264 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add)
1265 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add)
1266 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub)
1267 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub)
1268 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
1269 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
1270 #if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012)
1271 // details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 )
1272 // 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2
1273 inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
1274 {
1275  uchar a_[16], b_[16];
1276  wasm_v128_store(a_, a.val);
1277  wasm_v128_store(b_, b.val);
1278  for (int i = 0; i < 16; i++)
1279  a_[i] = (uchar)(a_[i] * b_[i]);
1280  return v_uint8x16(wasm_v128_load(a_));
1281 }
1282 inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
1283 {
1284  schar a_[16], b_[16];
1285  wasm_v128_store(a_, a.val);
1286  wasm_v128_store(b_, b.val);
1287  for (int i = 0; i < 16; i++)
1288  a_[i] = (schar)(a_[i] * b_[i]);
1289  return v_int8x16(wasm_v128_load(a_));
1290 }
1291 #else
1292 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
1293 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
1294 #endif
1295 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul)
1296 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul)
1297 
1298 
1299 
1301 inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
1302 { return v_add_wrap(a - b, b - a); }
1303 inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
1304 { return v_add_wrap(a - b, b - a); }
1305 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
1306 { return v_max(a, b) - v_min(a, b); }
1307 
1308 inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1309 {
1310  v_int8x16 d = v_sub_wrap(a, b);
1311  v_int8x16 m = a < b;
1312  return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1313 }
1314 inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1315 {
1316  return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
1317 }
1318 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1319 {
1320  v_int32x4 d = a - b;
1321  v_int32x4 m = a < b;
1322  return v_reinterpret_as_u32((d ^ m) - m);
1323 }
1324 
1326 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1327 {
1328  v_int8x16 d = a - b;
1329  v_int8x16 m = a < b;
1330  return (d ^ m) - m;
1331  }
1332 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1333 { return v_max(a, b) - v_min(a, b); }
1334 
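v_absdiff returns the absolute difference as the unsigned counterpart type (so it cannot overflow), while v_absdiffs keeps the signed type and saturates. A sketch (not part of the header):

    v_int8x16 a = v_setall_s8(-100), b = v_setall_s8(100);
    v_uint8x16 d = v_absdiff(a, b);    // every lane = 200
    v_int8x16  s = v_absdiffs(a, b);   // every lane = 127 (saturated)
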
1335 
1336 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1337 {
1338  return a * b + c;
1339 }
1340 
1341 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1342 {
1343  return v_fma(a, b, c);
1344 }
1345 
1346 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1347 {
1348  return a * b + c;
1349 }
1350 
1351 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1352 {
1353  return a * b + c;
1354 }
1355 
1356 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
1357 {
1358  v128_t absmask_vec = wasm_i32x4_splat(0x7fffffff);
1359  return v_float32x4(wasm_v128_and(wasm_f32x4_sub(a.val, b.val), absmask_vec));
1360 }
1361 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
1362 {
1363  v128_t absmask_vec = wasm_u64x2_shr(wasm_i32x4_splat(-1), 1);
1364  return v_float64x2(wasm_v128_and(wasm_f64x2_sub(a.val, b.val), absmask_vec));
1365 }
1366 
1367 #define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) \
1368 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1369 { \
1370  v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
1371  v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
1372  return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); \
1373 } \
1374 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1375 { \
1376  v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
1377  v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
1378  return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); \
1379 } \
1380 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1381 { \
1382  return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); \
1383 }
1384 
1385 OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
1386 OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
1387 
1388 #define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
1389 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
1390 { \
1391  return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
1392 } \
1393 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
1394 { \
1395  return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
1396 } \
1397 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
1398 { \
1399  return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
1400 } \
1401 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
1402 { \
1403  return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
1404 } \
1405 template<int imm> \
1406 inline _Tpuvec v_shl(const _Tpuvec& a) \
1407 { \
1408  return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
1409 } \
1410 template<int imm> \
1411 inline _Tpsvec v_shl(const _Tpsvec& a) \
1412 { \
1413  return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
1414 } \
1415 template<int imm> \
1416 inline _Tpuvec v_shr(const _Tpuvec& a) \
1417 { \
1418  return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
1419 } \
1420 template<int imm> \
1421 inline _Tpsvec v_shr(const _Tpsvec& a) \
1422 { \
1423  return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
1424 }
1425 
1426 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint8x16, v_int8x16, i8x16, u8x16)
1427 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint16x8, v_int16x8, i16x8, u16x8)
1428 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint32x4, v_int32x4, i32x4, u32x4)
1429 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint64x2, v_int64x2, i64x2, u64x2)
1430 
1431 namespace hal_wasm_internal
1432 {
1433  template <int imm,
1434  bool is_invalid = ((imm < 0) || (imm > 16)),
1435  bool is_first = (imm == 0),
1436  bool is_second = (imm == 16),
1437  bool is_other = (((imm > 0) && (imm < 16)))>
1438  class v_wasm_palignr_u8_class;
1439 
1440  template <int imm>
1441  class v_wasm_palignr_u8_class<imm, true, false, false, false>;
1442 
1443  template <int imm>
1444  class v_wasm_palignr_u8_class<imm, false, true, false, false>
1445  {
1446  public:
1447  inline v128_t operator()(const v128_t& a, const v128_t&) const
1448  {
1449  return a;
1450  }
1451  };
1452 
1453  template <int imm>
1454  class v_wasm_palignr_u8_class<imm, false, false, true, false>
1455  {
1456  public:
1457  inline v128_t operator()(const v128_t&, const v128_t& b) const
1458  {
1459  return b;
1460  }
1461  };
1462 
1463  template <int imm>
1464  class v_wasm_palignr_u8_class<imm, false, false, false, true>
1465  {
1466  public:
1467  inline v128_t operator()(const v128_t& a, const v128_t& b) const
1468  {
1469  enum { imm2 = (sizeof(v128_t) - imm) };
1470  return wasm_v8x16_shuffle(a, b,
1471  imm, imm+1, imm+2, imm+3,
1472  imm+4, imm+5, imm+6, imm+7,
1473  imm+8, imm+9, imm+10, imm+11,
1474  imm+12, imm+13, imm+14, imm+15);
1475  }
1476  };
1477 
1478  template <int imm>
1479  inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
1480  {
1481  CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
1482  return v_wasm_palignr_u8_class<imm>()(a, b);
1483  }
1484 }
1485 
1486 template<int imm, typename _Tpvec>
1487 inline _Tpvec v_rotate_right(const _Tpvec &a)
1488 {
1489  using namespace hal_wasm_internal;
1490  enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1491  v128_t z = wasm_i8x16_splat(0);
1492  return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, z));
1493 }
1494 
1495 template<int imm, typename _Tpvec>
1496 inline _Tpvec v_rotate_left(const _Tpvec &a)
1497 {
1498  using namespace hal_wasm_internal;
1499  enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1500  v128_t z = wasm_i8x16_splat(0);
1501  return _Tpvec(v_wasm_palignr_u8<imm2>(z, a.val));
1502 }
1503 
1504 template<int imm, typename _Tpvec>
1505 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1506 {
1507  using namespace hal_wasm_internal;
1508  enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1509  return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, b.val));
1510 }
1511 
1512 template<int imm, typename _Tpvec>
1513 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1514 {
1515  using namespace hal_wasm_internal;
1516  enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1517  return _Tpvec(v_wasm_palignr_u8<imm2>(b.val, a.val));
1518 }
1519 
1520 #define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1521 inline _Tpvec v_load(const _Tp* ptr) \
1522 { return _Tpvec(wasm_v128_load(ptr)); } \
1523 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1524 { return _Tpvec(wasm_v128_load(ptr)); } \
1525 inline _Tpvec v_load_low(const _Tp* ptr) \
1526 { \
1527  _Tp tmp[_Tpvec::nlanes] = {0}; \
1528  for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1529  tmp[i] = ptr[i]; \
1530  } \
1531  return _Tpvec(wasm_v128_load(tmp)); \
1532 } \
1533 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1534 { \
1535  _Tp tmp[_Tpvec::nlanes]; \
1536  for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1537  tmp[i] = ptr0[i]; \
1538  tmp[i+_Tpvec::nlanes/2] = ptr1[i]; \
1539  } \
1540  return _Tpvec(wasm_v128_load(tmp)); \
1541 } \
1542 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1543 { wasm_v128_store(ptr, a.val); } \
1544 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1545 { wasm_v128_store(ptr, a.val); } \
1546 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1547 { wasm_v128_store(ptr, a.val); } \
1548 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1549 { \
1550  wasm_v128_store(ptr, a.val); \
1551 } \
1552 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1553 { \
1554  _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1555  wasm_v128_store(a_, a.val); \
1556  for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1557  ptr[i] = a_[i]; \
1558 } \
1559 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1560 { \
1561  _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1562  wasm_v128_store(a_, a.val); \
1563  for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1564  ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
1565 }
1566 
1567 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
1568 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
1569 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
1570 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
1571 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1572 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int)
1573 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64)
1574 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64)
1575 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float)
1576 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double)
1577 
1578 
1579 
1580 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1581 { return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
1582 
1583 inline v_int8x16 v_reverse(const v_int8x16 &a)
1584 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1585 
1586 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1587 { return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); }
1588 
1589 inline v_int16x8 v_reverse(const v_int16x8 &a)
1590 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1591 
1592 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1593 { return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); }
1594 
1595 inline v_int32x4 v_reverse(const v_int32x4 &a)
1596 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1597 
1598 inline v_float32x4 v_reverse(const v_float32x4 &a)
1599 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1600 
1601 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1602 { return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); }
1603 
1604 inline v_int64x2 v_reverse(const v_int64x2 &a)
1605 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1606 
1607 inline v_float64x2 v_reverse(const v_float64x2 &a)
1608 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1609 
1610 
1611 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1612 inline scalartype v_reduce_sum(const _Tpvec& a) \
1613 { \
1614  regtype val = a.val; \
1615  val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1616  val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \
1617  return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1618 }
1619 
1620 OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4)
1621 OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4)
1622 OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
1623 
1624 // TODO: Optimize v_reduce_sum with wasm intrinsics.
1625 // For now, use a fallback implementation, as wasm intrinsics provide no widening op yet.
1626 
1627 #define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \
1628 inline scalartype v_reduce_sum(const _Tpvec& a) \
1629 { \
1630  _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1631  wasm_v128_store(a_, a.val); \
1632  scalartype c = a_[0]; \
1633  for (int i = 1; i < _Tpvec::nlanes; i++) \
1634  c += a_[i]; \
1635  return c; \
1636 }
1637 
1638 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
1639 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int)
1640 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned)
1641 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int)
1642 
1643 
1644 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1645 inline scalartype v_reduce_sum(const _Tpvec& a) \
1646 { \
1647  regtype val = a.val; \
1648  val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1649  return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1650 }
1651 OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2)
1652 OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64, v128_t, i64x2, i64x2)
1653 OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double, v128_t, f64x2,f64x2)
1654 
1655 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1656  const v_float32x4& c, const v_float32x4& d)
1657 {
1658  v128_t ac = wasm_f32x4_add(wasm_unpacklo_i32x4(a.val, c.val), wasm_unpackhi_i32x4(a.val, c.val));
1659  v128_t bd = wasm_f32x4_add(wasm_unpacklo_i32x4(b.val, d.val), wasm_unpackhi_i32x4(b.val, d.val));
1660  return v_float32x4(wasm_f32x4_add(wasm_unpacklo_i32x4(ac, bd), wasm_unpackhi_i32x4(ac, bd)));
1661 }
1662 
1663 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) \
1664 inline scalartype v_reduce_##func(const _Tpvec& a) \
1665 { \
1666  scalartype buf[_Tpvec::nlanes]; \
1667  v_store(buf, a); \
1668  scalartype tmp = buf[0]; \
1669  for (int i=1; i<_Tpvec::nlanes; ++i) { \
1670  tmp = scalar_func(tmp, buf[i]); \
1671  } \
1672  return tmp; \
1673 }
1674 
1675 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, max, std::max)
1676 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, min, std::min)
1677 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, max, std::max)
1678 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, min, std::min)
1679 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, max, std::max)
1680 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, min, std::min)
1681 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, max, std::max)
1682 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, min, std::min)
1683 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, max, std::max)
1684 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, min, std::min)
1685 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, max, std::max)
1686 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, min, std::min)
1687 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, max, std::max)
1688 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, min, std::min)
1689 
1690 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1691 {
1692  v_uint16x8 l16, h16;
1693  v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1694  v_expand(v_absdiff(a, b), l16, h16);
1695  v_expand(l16, l16_l32, l16_h32);
1696  v_expand(h16, h16_l32, h16_h32);
1697  return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1698 }
1699 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1700 {
1701  v_uint16x8 l16, h16;
1702  v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1703  v_expand(v_absdiff(a, b), l16, h16);
1704  v_expand(l16, l16_l32, l16_h32);
1705  v_expand(h16, h16_l32, h16_h32);
1706  return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1707 }
1708 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1709 {
1710  v_uint32x4 l, h;
1711  v_expand(v_absdiff(a, b), l, h);
1712  return v_reduce_sum(l + h);
1713 }
1714 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1715 {
1716  v_uint32x4 l, h;
1717  v_expand(v_absdiff(a, b), l, h);
1718  return v_reduce_sum(l + h);
1719 }
1720 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1721 {
1722  return v_reduce_sum(v_absdiff(a, b));
1723 }
1724 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1725 {
1726  return v_reduce_sum(v_absdiff(a, b));
1727 }
1728 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1729 {
1730  return v_reduce_sum(v_absdiff(a, b));
1731 }
1732 
1733 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1734 {
1735  v128_t m1 = wasm_i32x4_splat(0x55555555);
1736  v128_t m2 = wasm_i32x4_splat(0x33333333);
1737  v128_t m4 = wasm_i32x4_splat(0x0f0f0f0f);
1738  v128_t p = a.val;
1739  p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 1), m1), wasm_v128_and(p, m1));
1740  p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 2), m2), wasm_v128_and(p, m2));
1741  p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 4), m4), wasm_v128_and(p, m4));
1742  return v_uint8x16(p);
1743 }
1744 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1745 {
1746  v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1747  p += v_rotate_right<1>(p);
1748  return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
1749 }
1750 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1751 {
1752  v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1753  p += v_rotate_right<1>(p);
1754  p += v_rotate_right<2>(p);
1755  return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
1756 }
1757 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1758 {
1759  uint64 a_[2], b_[2] = { 0 };
1760  wasm_v128_store(a_, a.val);
1761  for (int i = 0; i < 16; i++)
1762  b_[i / 8] += popCountTable[((uint8_t*)a_)[i]];
1763  return v_uint64x2(wasm_v128_load(b_));
1764 }
1765 inline v_uint8x16 v_popcount(const v_int8x16& a)
1766 { return v_popcount(v_reinterpret_as_u8(a)); }
1767 inline v_uint16x8 v_popcount(const v_int16x8& a)
1768 { return v_popcount(v_reinterpret_as_u16(a)); }
1769 inline v_uint32x4 v_popcount(const v_int32x4& a)
1770 { return v_popcount(v_reinterpret_as_u32(a)); }
1771 inline v_uint64x2 v_popcount(const v_int64x2& a)
1772 { return v_popcount(v_reinterpret_as_u64(a)); }
1773 
1774 #define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \
1775 inline int v_signmask(const _Tpvec& a) \
1776 { \
1777  _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1778  wasm_v128_store(a_, a.val); \
1779  int mask = 0; \
1780  for (int i = 0; i < _Tpvec::nlanes; i++) \
1781  mask |= (reinterpret_int(a_[i]) < 0) << i; \
1782  return mask; \
1783 } \
1784 inline bool v_check_all(const _Tpvec& a) \
1785 { return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \
1786 inline bool v_check_any(const _Tpvec& a) \
1787 { return wasm_i8x16_any_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); }
1788 
1789 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar)
1790 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar)
1791 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short)
1792 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short)
1793 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int)
1794 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int)
1795 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float)
1796 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float64x2, f64x2, double)
1797 
1798 #define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) \
1799 inline bool v_check_all(const _Tpvec& a) \
1800 { \
1801  v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1802  masked = wasm_i32x4_replace_lane(masked, 0, 0xffffffff); \
1803  masked = wasm_i32x4_replace_lane(masked, 2, 0xffffffff); \
1804  return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1805 } \
1806 inline bool v_check_any(const _Tpvec& a) \
1807 { \
1808  v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1809  masked = wasm_i32x4_replace_lane(masked, 0, 0x0); \
1810  masked = wasm_i32x4_replace_lane(masked, 2, 0x0); \
1811  return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1812 } \
1813 
1814 OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32)
1815 OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32)
1816 
1817 
1818 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1819 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1820 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1821 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1822 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1823 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1824 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1825 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1826 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1827 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1828 
1829 #define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) \
1830 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1831 { \
1832  return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); \
1833 }
1834 
1835 OPENCV_HAL_IMPL_WASM_SELECT(v_uint8x16)
1836 OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16)
1837 OPENCV_HAL_IMPL_WASM_SELECT(v_uint16x8)
1838 OPENCV_HAL_IMPL_WASM_SELECT(v_int16x8)
1839 OPENCV_HAL_IMPL_WASM_SELECT(v_uint32x4)
1840 OPENCV_HAL_IMPL_WASM_SELECT(v_int32x4)
1841 OPENCV_HAL_IMPL_WASM_SELECT(v_uint64x2)
1842 OPENCV_HAL_IMPL_WASM_SELECT(v_int64x2)
1843 OPENCV_HAL_IMPL_WASM_SELECT(v_float32x4)
1844 OPENCV_HAL_IMPL_WASM_SELECT(v_float64x2)
1845 
1846 #define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1847 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1848 { \
1849  b0.val = intrin(a.val); \
1850  b1.val = __CV_CAT(intrin, _high)(a.val); \
1851 } \
1852 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1853 { return _Tpwvec(intrin(a.val)); } \
1854 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1855 { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
1856 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1857 { \
1858  v128_t a = wasm_v128_load(ptr); \
1859  return _Tpwvec(intrin(a)); \
1860 }
1861 
1862 OPENCV_HAL_IMPL_WASM_EXPAND(v_uint8x16, v_uint16x8, uchar, v128_cvtu8x16_i16x8)
1863 OPENCV_HAL_IMPL_WASM_EXPAND(v_int8x16, v_int16x8, schar, v128_cvti8x16_i16x8)
1864 OPENCV_HAL_IMPL_WASM_EXPAND(v_uint16x8, v_uint32x4, ushort, v128_cvtu16x8_i32x4)
1865 OPENCV_HAL_IMPL_WASM_EXPAND(v_int16x8, v_int32x4, short, v128_cvti16x8_i32x4)
1866 OPENCV_HAL_IMPL_WASM_EXPAND(v_uint32x4, v_uint64x2, unsigned, v128_cvtu32x4_i64x2)
1867 OPENCV_HAL_IMPL_WASM_EXPAND(v_int32x4, v_int64x2, int, v128_cvti32x4_i64x2)
1868 
1869 #define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin) \
1870 inline _Tpvec v_load_expand_q(const _Tp* ptr) \
1871 { \
1872  v128_t a = wasm_v128_load(ptr); \
1873  return _Tpvec(intrin(a)); \
1874 }
1875 
1876 OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4)
1877 OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, v128_cvti8x16_i32x4)
1878 
1879 #define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) \
1880 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1881 { \
1882  b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
1883  b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
1884 } \
1885 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1886 { \
1887  return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
1888 } \
1889 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1890 { \
1891  return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
1892 } \
1893 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1894 { \
1895  c.val = wasm_unpacklo_i64x2(a.val, b.val); \
1896  d.val = wasm_unpackhi_i64x2(a.val, b.val); \
1897 }
1898 
1899 OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
1900 OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
1901 OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
1902 OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
1903 OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
1904 OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
1905 OPENCV_HAL_IMPL_WASM_UNPACKS(v_float32x4, i32x4)
1906 OPENCV_HAL_IMPL_WASM_UNPACKS(v_float64x2, i64x2)
1907 
1908 template<int s, typename _Tpvec>
1909 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
1910 {
1911  return v_rotate_right<s>(a, b);
1912 }
1913 
1914 inline v_int32x4 v_round(const v_float32x4& a)
1915 {
1916  v128_t h = wasm_f32x4_splat(0.5);
1917  return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
1918 }
1919 
1920 inline v_int32x4 v_floor(const v_float32x4& a)
1921 {
1922  v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1923  v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1));
1924  return v_int32x4(wasm_i32x4_add(a1, mask));
1925 }
1926 
1927 inline v_int32x4 v_ceil(const v_float32x4& a)
1928 {
1929  v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1930  v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1));
1931  return v_int32x4(wasm_i32x4_sub(a1, mask));
1932 }
1933 
1934 inline v_int32x4 v_trunc(const v_float32x4& a)
1935 { return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
1936 
1937 #define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
1938 inline v_int32x4 func(const v_float64x2& a) \
1939 { \
1940  double a_[2]; \
1941  wasm_v128_store(a_, a.val); \
1942  int c_[4]; \
1943  c_[0] = cfunc(a_[0]); \
1944  c_[1] = cfunc(a_[1]); \
1945  c_[2] = 0; \
1946  c_[3] = 0; \
1947  return v_int32x4(wasm_v128_load(c_)); \
1948 }
1949 
1950 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
1951 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
1952 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
1953 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
1954 
1955 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1956 {
1957  double a_[2], b_[2];
1958  wasm_v128_store(a_, a.val);
1959  wasm_v128_store(b_, b.val);
1960  int c_[4];
1961  c_[0] = cvRound(a_[0]);
1962  c_[1] = cvRound(a_[1]);
1963  c_[2] = cvRound(b_[0]);
1964  c_[3] = cvRound(b_[1]);
1965  return v_int32x4(wasm_v128_load(c_));
1966 }
1967 
1968 #define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
1969 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1970  const _Tpvec& a2, const _Tpvec& a3, \
1971  _Tpvec& b0, _Tpvec& b1, \
1972  _Tpvec& b2, _Tpvec& b3) \
1973 { \
1974  v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
1975  v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
1976  v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
1977  v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
1978 \
1979  b0.val = wasm_unpacklo_i64x2(t0, t1); \
1980  b1.val = wasm_unpackhi_i64x2(t0, t1); \
1981  b2.val = wasm_unpacklo_i64x2(t2, t3); \
1982  b3.val = wasm_unpackhi_i64x2(t2, t3); \
1983 }
1984 
1985 OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
1986 OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
1987 OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4, i32x4)
1988 
1989 // load deinterleave
1990 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
1991 {
1992  v128_t t00 = wasm_v128_load(ptr);
1993  v128_t t01 = wasm_v128_load(ptr + 16);
1994 
1995  a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30);
1996  b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31);
1997 }
1998 
1999 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
2000 {
2001  v128_t t00 = wasm_v128_load(ptr);
2002  v128_t t01 = wasm_v128_load(ptr + 16);
2003  v128_t t02 = wasm_v128_load(ptr + 32);
2004 
2005  v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7);
2006  v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6);
2007  v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7);
2008 
2009  a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29);
2010  b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30);
2011  c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31);
2012 }
2013 
2014 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2015 {
2016  v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2017  v128_t u1 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2018  v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ...
2019  v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ...
2020 
2021  v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2022  v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2023  v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2024  v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2025 
2026  a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2027  b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2028  c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2029  d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2030 }
2031 
2032 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2033 {
2034  v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 a2 b2 a3 b3
2035  v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7
2036 
2037  a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7
2038  b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 b2 b3 b4 b5 b6 b7
2039 }
2040 
2041 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2042 {
2043  v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 b1 c1 a2 b2
2044  v128_t t01 = wasm_v128_load(ptr + 8); // c2 a3 b3 c3 a4 b4 c4 a5
2045  v128_t t02 = wasm_v128_load(ptr + 16); // b5 c5 a6 b6 c6 a7 b7 c7
2046 
2047  v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5);
2048  v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7);
2049  v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7);
2050 
2051  a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27);
2052  b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29);
2053  c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31);
2054 }
2055 
2056 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2057 {
2058  v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2059  v128_t u1 = wasm_v128_load(ptr + 8); // a2 b2 c2 d2 ...
2060  v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2061  v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ...
2062 
2063  v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3
2064  v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7
2065  v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3
2066  v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7
2067 
2068  a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2069  b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2070  c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2071  d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2072 }
2073 
2074 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2075 {
2076  v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1
2077  v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3
2078 
2079  a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2080  b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2081 }
2082 
2083 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2084 {
2085  v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1
2086  v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3
2087  v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4
2088 
2089  v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2090  v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2091  v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2092 
2093  a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2094  b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2095  c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2096 }
2097 
2098 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2099 {
2100  v_uint32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0
2101  v_uint32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1
2102  v_uint32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2
2103  v_uint32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2104 
2105  v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2106 }
2107 
2108 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2109 {
2110  v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1
2111  v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3
2112 
2113  a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2114  b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2115 }
2116 
2117 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2118 {
2119  v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1
2120  v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3
2121  v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4
2122 
2123  v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2124  v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2125  v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2126 
2127  a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2128  b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2129  c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2130 }
2131 
2132 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2133 {
2134  v_float32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0
2135  v_float32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1
2136  v_float32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2
2137  v_float32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2138 
2139  v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2140 }
2141 
2142 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2143 {
2144  v128_t t0 = wasm_v128_load(ptr); // a0 b0
2145  v128_t t1 = wasm_v128_load(ptr + 2); // a1 b1
2146 
2147  a.val = wasm_unpacklo_i64x2(t0, t1);
2148  b.val = wasm_unpackhi_i64x2(t0, t1);
2149 }
2150 
2151 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2152 {
2153  v128_t t0 = wasm_v128_load(ptr); // a0, b0
2154  v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1
2155  v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1
2156 
2157  a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2158  b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23);
2159  c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2160 }
2161 
2162 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2163  v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2164 {
2165  v128_t t0 = wasm_v128_load(ptr); // a0 b0
2166  v128_t t1 = wasm_v128_load(ptr + 2); // c0 d0
2167  v128_t t2 = wasm_v128_load(ptr + 4); // a1 b1
2168  v128_t t3 = wasm_v128_load(ptr + 6); // c1 d1
2169 
2170  a.val = wasm_unpacklo_i64x2(t0, t2);
2171  b.val = wasm_unpackhi_i64x2(t0, t2);
2172  c.val = wasm_unpacklo_i64x2(t1, t3);
2173  d.val = wasm_unpackhi_i64x2(t1, t3);
2174 }
2175 
2176 // store interleave
2177 
2178 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2179  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2180 {
2181  v128_t v0 = wasm_unpacklo_i8x16(a.val, b.val);
2182  v128_t v1 = wasm_unpackhi_i8x16(a.val, b.val);
2183 
2184  wasm_v128_store(ptr, v0);
2185  wasm_v128_store(ptr + 16, v1);
2186 }
2187 
2188 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2189  const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2190 {
2191  v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5);
2192  v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26);
2193  v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0);
2194 
2195  v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15);
2196  v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15);
2197  v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31);
2198 
2199  wasm_v128_store(ptr, t10);
2200  wasm_v128_store(ptr + 16, t11);
2201  wasm_v128_store(ptr + 32, t12);
2202 }
2203 
2204 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2205  const v_uint8x16& c, const v_uint8x16& d,
2206  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2207 {
2208  // a0 a1 a2 a3 ....
2209  // b0 b1 b2 b3 ....
2210  // c0 c1 c2 c3 ....
2211  // d0 d1 d2 d3 ....
2212  v128_t u0 = wasm_unpacklo_i8x16(a.val, c.val); // a0 c0 a1 c1 ...
2213  v128_t u1 = wasm_unpackhi_i8x16(a.val, c.val); // a8 c8 a9 c9 ...
2214  v128_t u2 = wasm_unpacklo_i8x16(b.val, d.val); // b0 d0 b1 d1 ...
2215  v128_t u3 = wasm_unpackhi_i8x16(b.val, d.val); // b8 d8 b9 d9 ...
2216 
2217  v128_t v0 = wasm_unpacklo_i8x16(u0, u2); // a0 b0 c0 d0 ...
2218  v128_t v1 = wasm_unpackhi_i8x16(u0, u2); // a4 b4 c4 d4 ...
2219  v128_t v2 = wasm_unpacklo_i8x16(u1, u3); // a8 b8 c8 d8 ...
2220  v128_t v3 = wasm_unpackhi_i8x16(u1, u3); // a12 b12 c12 d12 ...
2221 
2222  wasm_v128_store(ptr, v0);
2223  wasm_v128_store(ptr + 16, v1);
2224  wasm_v128_store(ptr + 32, v2);
2225  wasm_v128_store(ptr + 48, v3);
2226 }
2227 
2228 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2229  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2230 {
2231  v128_t v0 = wasm_unpacklo_i16x8(a.val, b.val);
2232  v128_t v1 = wasm_unpackhi_i16x8(a.val, b.val);
2233 
2234  wasm_v128_store(ptr, v0);
2235  wasm_v128_store(ptr + 8, v1);
2236 }
2237 
2238 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2239  const v_uint16x8& b, const v_uint16x8& c,
2240  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2241 {
2242  v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21);
2243  v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11);
2244  v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0);
2245 
2246  v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15);
2247  v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15);
2248  v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31);
2249 
2250  wasm_v128_store(ptr, t10);
2251  wasm_v128_store(ptr + 8, t11);
2252  wasm_v128_store(ptr + 16, t12);
2253 }
2254 
2255 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2256  const v_uint16x8& c, const v_uint16x8& d,
2257  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2258 {
2259  // a0 a1 a2 a3 ....
2260  // b0 b1 b2 b3 ....
2261  // c0 c1 c2 c3 ....
2262  // d0 d1 d2 d3 ....
2263  v128_t u0 = wasm_unpacklo_i16x8(a.val, c.val); // a0 c0 a1 c1 ...
2264  v128_t u1 = wasm_unpackhi_i16x8(a.val, c.val); // a4 c4 a5 c5 ...
2265  v128_t u2 = wasm_unpacklo_i16x8(b.val, d.val); // b0 d0 b1 d1 ...
2266  v128_t u3 = wasm_unpackhi_i16x8(b.val, d.val); // b4 d4 b5 d5 ...
2267 
2268  v128_t v0 = wasm_unpacklo_i16x8(u0, u2); // a0 b0 c0 d0 ...
2269  v128_t v1 = wasm_unpackhi_i16x8(u0, u2); // a2 b2 c2 d2 ...
2270  v128_t v2 = wasm_unpacklo_i16x8(u1, u3); // a4 b4 c4 d4 ...
2271  v128_t v3 = wasm_unpackhi_i16x8(u1, u3); // a6 b6 c6 d6 ...
2272 
2273  wasm_v128_store(ptr, v0);
2274  wasm_v128_store(ptr + 8, v1);
2275  wasm_v128_store(ptr + 16, v2);
2276  wasm_v128_store(ptr + 24, v3);
2277 }
2278 
2279 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2280  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2281 {
2282  v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2283  v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2284 
2285  wasm_v128_store(ptr, v0);
2286  wasm_v128_store(ptr + 4, v1);
2287 }
2288 
2289 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2290  const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2291 {
2292  v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2293  v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2294  v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2295 
2296  v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2297  v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2298  v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2299 
2300  wasm_v128_store(ptr, t10);
2301  wasm_v128_store(ptr + 4, t11);
2302  wasm_v128_store(ptr + 8, t12);
2303 }
2304 
2305 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2306  const v_uint32x4& c, const v_uint32x4& d,
2307  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2308 {
2309  v_uint32x4 v0, v1, v2, v3;
2310  v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2311 
2312  wasm_v128_store(ptr, v0.val);
2313  wasm_v128_store(ptr + 4, v1.val);
2314  wasm_v128_store(ptr + 8, v2.val);
2315  wasm_v128_store(ptr + 12, v3.val);
2316 }
2317 
2318 // 2-channel, float only
2319 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2320  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2321 {
2322  v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2323  v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2324 
2325  wasm_v128_store(ptr, v0);
2326  wasm_v128_store(ptr + 4, v1);
2327 }
2328 
2329 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2330  const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2331 {
2332  v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2333  v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2334  v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2335 
2336  v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2337  v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2338  v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2339 
2340  wasm_v128_store(ptr, t10);
2341  wasm_v128_store(ptr + 4, t11);
2342  wasm_v128_store(ptr + 8, t12);
2343 }
2344 
2345 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2346  const v_float32x4& c, const v_float32x4& d,
2347  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2348 {
2349  v_float32x4 v0, v1, v2, v3;
2350  v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2351 
2352  wasm_v128_store(ptr, v0.val);
2353  wasm_v128_store(ptr + 4, v1.val);
2354  wasm_v128_store(ptr + 8, v2.val);
2355  wasm_v128_store(ptr + 12, v3.val);
2356 }
2357 
2358 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2359  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2360 {
2361  v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
2362  v128_t v1 = wasm_unpackhi_i64x2(a.val, b.val);
2363 
2364  wasm_v128_store(ptr, v0);
2365  wasm_v128_store(ptr + 2, v1);
2366 }
2367 
2368 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2369  const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2370 {
2371  v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2372  v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15);
2373  v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2374 
2375  wasm_v128_store(ptr, v0);
2376  wasm_v128_store(ptr + 2, v1);
2377  wasm_v128_store(ptr + 4, v2);
2378 }
2379 
2380 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2381  const v_uint64x2& c, const v_uint64x2& d,
2382  hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2383 {
2384  v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
2385  v128_t v1 = wasm_unpacklo_i64x2(c.val, d.val);
2386  v128_t v2 = wasm_unpackhi_i64x2(a.val, b.val);
2387  v128_t v3 = wasm_unpackhi_i64x2(c.val, d.val);
2388 
2389  wasm_v128_store(ptr, v0);
2390  wasm_v128_store(ptr + 2, v1);
2391  wasm_v128_store(ptr + 4, v2);
2392  wasm_v128_store(ptr + 6, v3);
2393 }
2394 
2395 #define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2396 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2397 { \
2398  _Tpvec1 a1, b1; \
2399  v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2400  a0 = v_reinterpret_as_##suffix0(a1); \
2401  b0 = v_reinterpret_as_##suffix0(b1); \
2402 } \
2403 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2404 { \
2405  _Tpvec1 a1, b1, c1; \
2406  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2407  a0 = v_reinterpret_as_##suffix0(a1); \
2408  b0 = v_reinterpret_as_##suffix0(b1); \
2409  c0 = v_reinterpret_as_##suffix0(c1); \
2410 } \
2411 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2412 { \
2413  _Tpvec1 a1, b1, c1, d1; \
2414  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2415  a0 = v_reinterpret_as_##suffix0(a1); \
2416  b0 = v_reinterpret_as_##suffix0(b1); \
2417  c0 = v_reinterpret_as_##suffix0(c1); \
2418  d0 = v_reinterpret_as_##suffix0(d1); \
2419 } \
2420 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2421  hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2422 { \
2423  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2424  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2425  v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2426 } \
2427 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2428  const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2429 { \
2430  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2431  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2432  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2433  v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2434 } \
2435 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2436  const _Tpvec0& c0, const _Tpvec0& d0, \
2437  hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2438 { \
2439  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2440  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2441  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2442  _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2443  v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2444 }
2445 
2446 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2447 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2448 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2449 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2450 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
2451 
2452 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2453 {
2454  return v_float32x4(wasm_f32x4_convert_i32x4(a.val));
2455 }
2456 
2457 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2458 {
2459  double a_[2];
2460  wasm_v128_store(a_, a.val);
2461  float c_[4];
2462  c_[0] = (float)(a_[0]);
2463  c_[1] = (float)(a_[1]);
2464  c_[2] = 0;
2465  c_[3] = 0;
2466  return v_float32x4(wasm_v128_load(c_));
2467 }
2468 
2469 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2470 {
2471  double a_[2], b_[2];
2472  wasm_v128_store(a_, a.val);
2473  wasm_v128_store(b_, b.val);
2474  float c_[4];
2475  c_[0] = (float)(a_[0]);
2476  c_[1] = (float)(a_[1]);
2477  c_[2] = (float)(b_[0]);
2478  c_[3] = (float)(b_[1]);
2479  return v_float32x4(wasm_v128_load(c_));
2480 }
2481 
2482 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2483 {
2484 #ifdef __wasm_unimplemented_simd128__
2485  v128_t p = v128_cvti32x4_i64x2(a.val);
2486  return v_float64x2(wasm_f64x2_convert_i64x2(p));
2487 #else
2488  int a_[4];
2489  wasm_v128_store(a_, a.val);
2490  double c_[2];
2491  c_[0] = (double)(a_[0]);
2492  c_[1] = (double)(a_[1]);
2493  return v_float64x2(wasm_v128_load(c_));
2494 #endif
2495 }
2496 
2497 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2498 {
2499 #ifdef __wasm_unimplemented_simd128__
2500  v128_t p = v128_cvti32x4_i64x2_high(a.val);
2501  return v_float64x2(wasm_f64x2_convert_i64x2(p));
2502 #else
2503  int a_[4];
2504  wasm_v128_store(a_, a.val);
2505  double c_[2];
2506  c_[0] = (double)(a_[2]);
2507  c_[1] = (double)(a_[3]);
2508  return v_float64x2(wasm_v128_load(c_));
2509 #endif
2510 }
2511 
2512 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2513 {
2514  float a_[4];
2515  wasm_v128_store(a_, a.val);
2516  double c_[2];
2517  c_[0] = (double)(a_[0]);
2518  c_[1] = (double)(a_[1]);
2519  return v_float64x2(wasm_v128_load(c_));
2520 }
2521 
2522 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2523 {
2524  float a_[4];
2525  wasm_v128_store(a_, a.val);
2526  double c_[2];
2527  c_[0] = (double)(a_[2]);
2528  c_[1] = (double)(a_[3]);
2529  return v_float64x2(wasm_v128_load(c_));
2530 }
2531 
2532 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2533 {
2534 #ifdef __wasm_unimplemented_simd128__
2535  return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
2536 #else
2537  int64 a_[2];
2538  wasm_v128_store(a_, a.val);
2539  double c_[2];
2540  c_[0] = (double)(a_[0]);
2541  c_[1] = (double)(a_[1]);
2542  return v_float64x2(wasm_v128_load(c_));
2543 #endif
2544 }
2545 
2547 
2548 inline v_int8x16 v_lut(const schar* tab, const int* idx)
2549 {
2550  return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
2551  tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
2552 }
2553 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
2554 {
2555  return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
2556  tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]);
2557 }
2558 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2559 {
2560  return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
2561  tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]);
2562 }
2563 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
2564 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
2565 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
2566 
2567 inline v_int16x8 v_lut(const short* tab, const int* idx)
2568 {
2569  return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
2570  tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
2571 }
2572 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2573 {
2574  return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
2575  tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]);
2576 }
2577 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
2578 {
2579  return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
2580  tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]);
2581 }
2582 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
2583 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
2584 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
2585 
2586 inline v_int32x4 v_lut(const int* tab, const int* idx)
2587 {
2588  return v_int32x4(tab[idx[0]], tab[idx[1]],
2589  tab[idx[2]], tab[idx[3]]);
2590 }
2591 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
2592 {
2593  return v_int32x4(tab[idx[0]], tab[idx[0]+1],
2594  tab[idx[1]], tab[idx[1]+1]);
2595 }
2596 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
2597 {
2598  return v_int32x4(wasm_v128_load(tab + idx[0]));
2599 }
2600 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
2601 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
2602 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
2603 
2604 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
2605 {
2606  return v_int64x2(tab[idx[0]], tab[idx[1]]);
2607 }
2608 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
2609 {
2610  return v_int64x2(wasm_v128_load(tab + idx[0]));
2611 }
2612 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
2613 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2614 
2615 inline v_float32x4 v_lut(const float* tab, const int* idx)
2616 {
2617  return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
2618 }
2619 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
2620 inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
2621 
2622 inline v_float64x2 v_lut(const double* tab, const int* idx)
2623 {
2624  return v_float64x2(tab[idx[0]], tab[idx[1]]);
2625 }
2626 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
2627 {
2628  return v_float64x2(wasm_v128_load(tab + idx[0]));
2629 }
2630 
2631 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2632 {
2633  return v_int32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2634  tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2635  tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2636  tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2637 }
2638 
2639 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2640 {
2641  return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
2642 }
2643 
2644 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2645 {
2646  return v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2647  tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2648  tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2649  tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2650 }
2651 
2652 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
2653 {
2654  return v_float64x2(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2655  tab[wasm_i32x4_extract_lane(idxvec.val, 1)]);
2656 }
2657 
2658 // loads pairs from the table and deinterleaves them, e.g. returns:
2659 // x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
2660 // y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
2661 // note that the indices are element (float) indices, not float-pair indices.
2662 // in theory, this function can be used to implement bilinear interpolation,
2663 // when idxvec are the offsets within the image.
2664 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2665 {
2666  x = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2667  tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2668  tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2669  tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2670  y = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)+1],
2671  tab[wasm_i32x4_extract_lane(idxvec.val, 1)+1],
2672  tab[wasm_i32x4_extract_lane(idxvec.val, 2)+1],
2673  tab[wasm_i32x4_extract_lane(idxvec.val, 3)+1]);
2674 }
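// Illustrative usage sketch (not part of the original header): one way
// v_lut_deinterleave() could drive the bilinear interpolation mentioned above
// for a single-channel float image. The names example_bilinear_sample, img,
// stride, ofs, wx and wy are hypothetical: ofs holds the flat offset of each
// pixel's top-left neighbour, and wx/wy hold the fractional weights along x/y.
inline v_float32x4 example_bilinear_sample(const float* img, int stride,
                                           const v_int32x4& ofs,
                                           const v_float32x4& wx,
                                           const v_float32x4& wy)
{
    v_float32x4 x0, x1, y0, y1;
    v_lut_deinterleave(img, ofs, x0, x1);                         // top row: left/right pairs
    v_lut_deinterleave(img, ofs + v_setall_s32(stride), y0, y1);  // bottom row: left/right pairs
    v_float32x4 top = v_muladd(x1 - x0, wx, x0);                  // lerp along x (top row)
    v_float32x4 bot = v_muladd(y1 - y0, wx, y0);                  // lerp along x (bottom row)
    return v_muladd(bot - top, wy, top);                          // lerp along y
}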
2675 
2676 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
2677 {
2678  v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
2679  v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
2680  x.val = wasm_unpacklo_i64x2(xy0, xy1);
2681  y.val = wasm_unpackhi_i64x2(xy0, xy1);
2682 }
2683 
2684 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2685 {
2686  return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
2687 }
2688 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
2689 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2690 {
2691  return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
2692 }
2693 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
2694 
2695 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2696 {
2697  return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
2698 }
2699 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2700 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2701 {
2702  return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
2703 }
2704 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2705 
2706 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2707 {
2708  return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
2709 }
2710 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2711 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
2712 {
2713  return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
2714 }
2715 
2716 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2717 {
2718  return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
2719 }
2720 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2721 
2722 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2723 {
2724  return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
2725 }
2726 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2727 
2728 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2729 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2730 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
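
// Illustrative note (not part of the original header): v_pack_triplets takes a
// vector holding 3-element groups padded to 4 lanes each and compacts the
// useful values into the low lanes; the contents of the remaining high lanes
// should be treated as unspecified. The 32-bit variants are identity because a
// 128-bit register holds only a single padded triplet.
//
//   v_int8x16 v(0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1);
//   v_int8x16 t = v_pack_triplets(v); // low 12 lanes: 0,1,2,...,11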
2731 
2732 template<int i, typename _Tp>
2733 inline typename _Tp::lane_type v_extract_n(const _Tp& a)
2734 {
2735  return v_rotate_right<i>(a).get0();
2736 }
2737 
2738 template<int i>
2739 inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
2740 {
2741  return v_setall_u32(v_extract_n<i>(a));
2742 }
2743 template<int i>
2744 inline v_int32x4 v_broadcast_element(const v_int32x4& a)
2745 {
2746  return v_setall_s32(v_extract_n<i>(a));
2747 }
2748 template<int i>
2749 inline v_float32x4 v_broadcast_element(const v_float32x4& a)
2750 {
2751  return v_setall_f32(v_extract_n<i>(a));
2752 }
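
// Illustrative usage sketch (not part of the original header): v_extract_n<i>
// returns lane i as a scalar by rotating it into lane 0, and
// v_broadcast_element<i> fills a whole vector with that lane.
//
//   v_int32x4 v(1, 2, 3, 4);
//   int third = v_extract_n<2>(v);             // third == 3
//   v_int32x4 all = v_broadcast_element<2>(v); // all == {3, 3, 3, 3}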
2753 
2754 
2756 
2757 inline v_float32x4 v_load_expand(const hfloat* ptr)
2758 {
2759  float a[4];
2760  for (int i = 0; i < 4; i++)
2761  a[i] = ptr[i];
2762  return v_float32x4(wasm_v128_load(a));
2763 }
2764 
2765 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2766 {
2767  float v_[4];
2768  wasm_v128_store(v_, v.val);
2769  ptr[0] = hfloat(v_[0]);
2770  ptr[1] = hfloat(v_[1]);
2771  ptr[2] = hfloat(v_[2]);
2772  ptr[3] = hfloat(v_[3]);
2773 }
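
// Illustrative usage sketch (not part of the original header): these two
// helpers convert between 16-bit half floats (hfloat) and a 32-bit float
// register, four values at a time. Round-tripping a quad of values:
//
//   hfloat src[4] = {hfloat(1.f), hfloat(2.f), hfloat(3.f), hfloat(4.f)};
//   v_float32x4 f = v_load_expand(src); // widen to float32
//   hfloat dst[4];
//   v_pack_store(dst, f);               // narrow back to half precision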
2774 
2775 inline void v_cleanup() {}
2776 
2777 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2778 
2780 
2781 }
2782 
2783 #endif