EstervQrCode 1.1.1
Library for QR code manipulation
intrin_rvv.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4 
5 // The original implementation has been contributed by Yin Zhang.
6 // Copyright (C) 2020, Institute of Software, Chinese Academy of Sciences.
7 
8 #ifndef OPENCV_HAL_INTRIN_RVV_HPP
9 #define OPENCV_HAL_INTRIN_RVV_HPP
10 
11 #include <algorithm>
12 
13 // RVV intrinsics have been renamed in version 0.11, so we need to include
14 // compatibility headers:
15 // https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
16 #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>10999
17 #include "intrin_rvv_010_compat_non-policy.hpp"
18 #include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
19 #endif
20 
21 
22 // Building for T-Head C906 core with RVV 0.7.1 using toolchain
23 // https://github.com/T-head-Semi/xuantie-gnu-toolchain
24 // with option '-march=rv64gcv0p7'
25 #ifdef __THEAD_VERSION__
26 # if __riscv_v == 7000
27 # include <fenv.h>
28 # define CV_RVV_THEAD_0_7
29 # endif
30 #endif
31 
32 namespace cv
33 {
34 
36 
37 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
38 
39 #define CV_SIMD128 1
40 #ifndef CV_RVV_THEAD_0_7
41 # define CV_SIMD128_64F 1
42 #else
43 # define CV_SIMD128_64F 0
44 #endif
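// Illustrative sketch (assuming the universal-intrinsic types defined later in this
// header): double-precision vector code should be guarded by CV_SIMD128_64F so that
// RVV 0.7.1 targets (T-Head C906) fall back to scalar code.
//
//   #if CV_SIMD128_64F
//       v_float64x2 s = v_setall_f64(0.5);
//   #else
//       double s = 0.5;   // scalar fallback
//   #endif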
45 
47 // The following types have been defined in clang, but not in GCC yet.
48 #ifndef __clang__
49 
50 struct vuint8mf2_t
51 {
52  uchar val[8] = {0};
53  vuint8mf2_t() {}
54  vuint8mf2_t(const uchar* ptr)
55  {
56  for (int i = 0; i < 8; ++i)
57  {
58  val[i] = ptr[i];
59  }
60  }
61 };
62 struct vint8mf2_t
63 {
64  schar val[8] = {0};
65  vint8mf2_t() {}
66  vint8mf2_t(const schar* ptr)
67  {
68  for (int i = 0; i < 8; ++i)
69  {
70  val[i] = ptr[i];
71  }
72  }
73 };
74 struct vuint16mf2_t
75 {
76  ushort val[4] = {0};
77  vuint16mf2_t() {}
78  vuint16mf2_t(const ushort* ptr)
79  {
80  for (int i = 0; i < 4; ++i)
81  {
82  val[i] = ptr[i];
83  }
84  }
85 };
86 struct vint16mf2_t
87 {
88  short val[4] = {0};
89  vint16mf2_t() {}
90  vint16mf2_t(const short* ptr)
91  {
92  for (int i = 0; i < 4; ++i)
93  {
94  val[i] = ptr[i];
95  }
96  }
97 };
98 struct vuint32mf2_t
99 {
100  unsigned val[2] = {0};
101  vuint32mf2_t() {}
102  vuint32mf2_t(const unsigned* ptr)
103  {
104  val[0] = ptr[0];
105  val[1] = ptr[1];
106  }
107 };
108 struct vint32mf2_t
109 {
110  int val[2] = {0};
111  vint32mf2_t() {}
112  vint32mf2_t(const int* ptr)
113  {
114  val[0] = ptr[0];
115  val[1] = ptr[1];
116  }
117 };
118 struct vfloat32mf2_t
119 {
120  float val[2] = {0};
121  vfloat32mf2_t() {}
122  vfloat32mf2_t(const float* ptr)
123  {
124  val[0] = ptr[0];
125  val[1] = ptr[1];
126  }
127 };
128 struct vuint64mf2_t
129 {
130  uint64 val[1] = {0};
131  vuint64mf2_t() {}
132  vuint64mf2_t(const uint64* ptr)
133  {
134  val[0] = ptr[0];
135  }
136 };
137 struct vint64mf2_t
138 {
139  int64 val[1] = {0};
140  vint64mf2_t() {}
141  vint64mf2_t(const int64* ptr)
142  {
143  val[0] = ptr[0];
144  }
145 };
146 struct vfloat64mf2_t
147 {
148  double val[1] = {0};
149  vfloat64mf2_t() {}
150  vfloat64mf2_t(const double* ptr)
151  {
152  val[0] = ptr[0];
153  }
154 };
155 struct vuint8mf4_t
156 {
157  uchar val[4] = {0};
158  vuint8mf4_t() {}
159  vuint8mf4_t(const uchar* ptr)
160  {
161  for (int i = 0; i < 4; ++i)
162  {
163  val[i] = ptr[i];
164  }
165  }
166 };
167 struct vint8mf4_t
168 {
169  schar val[4] = {0};
170  vint8mf4_t() {}
171  vint8mf4_t(const schar* ptr)
172  {
173  for (int i = 0; i < 4; ++i)
174  {
175  val[i] = ptr[i];
176  }
177  }
178 };
179 
180 #define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
181 inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr, size_t vl) \
182 { \
183  CV_UNUSED(vl); \
184  return _Tpvec(ptr); \
185 } \
186 inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v, size_t vl) \
187 { \
188  CV_UNUSED(vl); \
189  for (int i = 0; i < n; ++i) \
190  { \
191  ptr[i] = v.val[i]; \
192  } \
193 }
194 
195 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8)
196 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8)
197 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4)
198 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4)
199 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2)
200 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2)
201 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2)
202 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1)
203 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1)
204 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)
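// For reference, the macro above expands for the u32 case into roughly the following
// emulated half-register load/store (the real code is produced by the preprocessor):
//
//   inline vuint32mf2_t vle32_v_u32mf2(const uint32_t* ptr, size_t vl)
//   {
//       CV_UNUSED(vl);
//       return vuint32mf2_t(ptr);
//   }
//   inline void vse32_v_u32mf2(uint32_t* ptr, vuint32mf2_t v, size_t vl)
//   {
//       CV_UNUSED(vl);
//       for (int i = 0; i < 2; ++i)
//           ptr[i] = v.val[i];
//   }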
205 
206 
207 #define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
208 inline _Tpwvec wcvt (_Tpvec v, size_t vl) \
209 { \
210  _wTp tmp[n]; \
211  for (int i = 0; i < n; ++i) \
212  { \
213  tmp[i] = (_wTp)v.val[i]; \
214  } \
215  return vle##width##_v_##suffix##m1(tmp, vl); \
216 }
217 
218 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)
219 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8)
220 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4)
221 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4)
222 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)
223 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)
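// Sketch of one expansion of the widening-convert helper above (u8mf2 -> u16m1):
//
//   inline vuint16m1_t vwcvtu_x_x_v_u16m1(vuint8mf2_t v, size_t vl)
//   {
//       ushort tmp[8];
//       for (int i = 0; i < 8; ++i)
//           tmp[i] = (ushort)v.val[i];
//       return vle16_v_u16m1(tmp, vl);
//   }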
224 
225 inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base, size_t vl)
226 {
227  CV_UNUSED(vl);
228  return vuint8mf4_t(base);
229 }
230 inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base, size_t vl)
231 {
232  CV_UNUSED(vl);
233  return vint8mf4_t(base);
234 }
235 
236 inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src, size_t vl)
237 {
238  ushort tmp[4];
239  for (int i = 0; i < 4; ++i)
240  {
241  tmp[i] = (ushort)src.val[i];
242  }
243  return vle16_v_u16mf2(tmp, vl);
244 }
245 inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src, size_t vl)
246 {
247  short tmp[4];
248  for (int i = 0; i < 4; ++i)
249  {
250  tmp[i] = (short)src.val[i];
251  }
252  return vle16_v_i16mf2(tmp, vl);
253 }
254 #endif
255 
257 
258 #ifndef __clang__
259 struct v_uint8x16
260 {
261  typedef uchar lane_type;
262  enum { nlanes = 16 };
263 
264  v_uint8x16() {}
265  explicit v_uint8x16(vuint8m1_t v)
266  {
267  vse8_v_u8m1(val, v, nlanes);
268  }
269  v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
270  uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
271  {
272  uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
273  for (int i = 0; i < nlanes; ++i)
274  {
275  val[i] = v[i];
276  }
277  }
278  operator vuint8m1_t() const
279  {
280  return vle8_v_u8m1(val, nlanes);
281  }
282  uchar get0() const
283  {
284  return val[0];
285  }
286 
287  uchar val[16];
288 };
289 
290 struct v_int8x16
291 {
292  typedef schar lane_type;
293  enum { nlanes = 16 };
294 
295  v_int8x16() {}
296  explicit v_int8x16(vint8m1_t v)
297  {
298  vse8_v_i8m1(val, v, nlanes);
299  }
300  v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
301  schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
302  {
303  schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
304  for (int i = 0; i < nlanes; ++i)
305  {
306  val[i] = v[i];
307  }
308  }
309  operator vint8m1_t() const
310  {
311  return vle8_v_i8m1(val, nlanes);
312  }
313  schar get0() const
314  {
315  return val[0];
316  }
317 
318  schar val[16];
319 };
320 
321 struct v_uint16x8
322 {
323  typedef ushort lane_type;
324  enum { nlanes = 8 };
325 
326  v_uint16x8() {}
327  explicit v_uint16x8(vuint16m1_t v)
328  {
329  vse16_v_u16m1(val, v, nlanes);
330  }
331  v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
332  {
333  ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
334  for (int i = 0; i < nlanes; ++i)
335  {
336  val[i] = v[i];
337  }
338  }
339  operator vuint16m1_t() const
340  {
341  return vle16_v_u16m1(val, nlanes);
342  }
343  ushort get0() const
344  {
345  return val[0];
346  }
347 
348  ushort val[8];
349 };
350 
351 struct v_int16x8
352 {
353  typedef short lane_type;
354  enum { nlanes = 8 };
355 
356  v_int16x8() {}
357  explicit v_int16x8(vint16m1_t v)
358  {
359  vse16_v_i16m1(val, v, nlanes);
360  }
361  v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
362  {
363  short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
364  for (int i = 0; i < nlanes; ++i)
365  {
366  val[i] = v[i];
367  }
368  }
369  operator vint16m1_t() const
370  {
371  return vle16_v_i16m1(val, nlanes);
372  }
373  short get0() const
374  {
375  return val[0];
376  }
377 
378  short val[8];
379 };
380 
381 struct v_uint32x4
382 {
383  typedef unsigned lane_type;
384  enum { nlanes = 4 };
385 
386  v_uint32x4() {}
387  explicit v_uint32x4(vuint32m1_t v)
388  {
389  vse32_v_u32m1(val, v, nlanes);
390  }
391  v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
392  {
393  unsigned v[] = {v0, v1, v2, v3};
394  for (int i = 0; i < nlanes; ++i)
395  {
396  val[i] = v[i];
397  }
398  }
399  operator vuint32m1_t() const
400  {
401  return vle32_v_u32m1(val, nlanes);
402  }
403  unsigned get0() const
404  {
405  return val[0];
406  }
407 
408  unsigned val[4];
409 };
410 
411 struct v_int32x4
412 {
413  typedef int lane_type;
414  enum { nlanes = 4 };
415 
416  v_int32x4() {}
417  explicit v_int32x4(vint32m1_t v)
418  {
419  vse32_v_i32m1(val, v, nlanes);
420  }
421  v_int32x4(int v0, int v1, int v2, int v3)
422  {
423  int v[] = {v0, v1, v2, v3};
424  for (int i = 0; i < nlanes; ++i)
425  {
426  val[i] = v[i];
427  }
428  }
429  operator vint32m1_t() const
430  {
431  return vle32_v_i32m1(val, nlanes);
432  }
433  int get0() const
434  {
435  return val[0];
436  }
437  int val[4];
438 };
439 
440 struct v_float32x4
441 {
442  typedef float lane_type;
443  enum { nlanes = 4 };
444 
445  v_float32x4() {}
446  explicit v_float32x4(vfloat32m1_t v)
447  {
448  vse32_v_f32m1(val, v, nlanes);
449  }
450  v_float32x4(float v0, float v1, float v2, float v3)
451  {
452  float v[] = {v0, v1, v2, v3};
453  for (int i = 0; i < nlanes; ++i)
454  {
455  val[i] = v[i];
456  }
457  }
458  operator vfloat32m1_t() const
459  {
460  return vle32_v_f32m1(val, nlanes);
461  }
462  float get0() const
463  {
464  return val[0];
465  }
466  float val[4];
467 };
468 
469 struct v_uint64x2
470 {
471  typedef uint64 lane_type;
472  enum { nlanes = 2 };
473 
474  v_uint64x2() {}
475  explicit v_uint64x2(vuint64m1_t v)
476  {
477  vse64_v_u64m1(val, v, nlanes);
478  }
479  v_uint64x2(uint64 v0, uint64 v1)
480  {
481  uint64 v[] = {v0, v1};
482  for (int i = 0; i < nlanes; ++i)
483  {
484  val[i] = v[i];
485  }
486  }
487  operator vuint64m1_t() const
488  {
489  return vle64_v_u64m1(val, nlanes);
490  }
491  uint64 get0() const
492  {
493  return val[0];
494  }
495 
496  uint64 val[2];
497 };
498 
499 struct v_int64x2
500 {
501  typedef int64 lane_type;
502  enum { nlanes = 2 };
503 
504  v_int64x2() {}
505  explicit v_int64x2(vint64m1_t v)
506  {
507  vse64_v_i64m1(val, v, nlanes);
508  }
509  v_int64x2(int64 v0, int64 v1)
510  {
511  int64 v[] = {v0, v1};
512  for (int i = 0; i < nlanes; ++i)
513  {
514  val[i] = v[i];
515  }
516  }
517  operator vint64m1_t() const
518  {
519  return vle64_v_i64m1(val, nlanes);
520  }
521  int64 get0() const
522  {
523  return val[0];
524  }
525 
526  int64 val[2];
527 };
528 
529 #if CV_SIMD128_64F
530 struct v_float64x2
531 {
532  typedef double lane_type;
533  enum { nlanes = 2 };
534 
535  v_float64x2() {}
536  explicit v_float64x2(vfloat64m1_t v)
537  {
538  vse64_v_f64m1(val, v, nlanes);
539  }
540  v_float64x2(double v0, double v1)
541  {
542  double v[] = {v0, v1};
543  for (int i = 0; i < nlanes; ++i)
544  {
545  val[i] = v[i];
546  }
547  }
548  operator vfloat64m1_t() const
549  {
550  return vle64_v_f64m1(val, nlanes);
551  }
552  double get0() const
553  {
554  return val[0];
555  }
556 
557  double val[2];
558 };
559 #endif
560 #else
561 struct v_uint8x16
562 {
563  typedef uchar lane_type;
564  enum { nlanes = 16 };
565 
566  v_uint8x16() {}
567  explicit v_uint8x16(vuint8m1_t v)
568  {
569  *pval = v;
570  }
571  v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
572  uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
573  {
574  uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
575  *pval = vle8_v_u8m1(v, nlanes);
576  }
577  operator vuint8m1_t() const
578  {
579  return *pval;
580  }
581  uchar get0() const
582  {
583  return vmv_x(*pval);
584  }
585  inline v_uint8x16& operator=(const v_uint8x16& vec) {
586  *pval = *(vec.pval);
587  return *this;
588  }
589  inline v_uint8x16(const v_uint8x16& vec) {
590  *pval = *(vec.pval);
591  }
592  uchar val[16];
593  vuint8m1_t* pval = (vuint8m1_t*)val;
594 };
595 
596 struct v_int8x16
597 {
598  typedef schar lane_type;
599  enum { nlanes = 16 };
600 
601  v_int8x16() {}
602  explicit v_int8x16(vint8m1_t v)
603  {
604  *pval = v;
605  }
606  v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
607  schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
608  {
609  schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
610  *pval = vle8_v_i8m1(v, nlanes);
611  }
612  operator vint8m1_t() const
613  {
614  return *pval;
615  }
616  schar get0() const
617  {
618  return vmv_x(*pval);
619  }
620  inline v_int8x16& operator=(const v_int8x16& vec) {
621  *pval = *(vec.pval);
622  return *this;
623  }
624  inline v_int8x16(const v_int8x16& vec) {
625  *pval = *(vec.pval);
626  }
627  schar val[16];
628  vint8m1_t* pval = (vint8m1_t*)val;
629 };
630 
631 struct v_uint16x8
632 {
633  typedef ushort lane_type;
634  enum { nlanes = 8 };
635 
636  v_uint16x8() {}
637  explicit v_uint16x8(vuint16m1_t v)
638  {
639  *pval = v;
640  }
641  v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
642  {
643  ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
644  *pval = vle16_v_u16m1(v, nlanes);
645  }
646  operator vuint16m1_t() const
647  {
648  return *pval;
649  }
650  ushort get0() const
651  {
652  return vmv_x(*pval);
653  }
654 
655  inline v_uint16x8& operator=(const v_uint16x8& vec) {
656  *pval = *(vec.pval);
657  return *this;
658  }
659  inline v_uint16x8(const v_uint16x8& vec) {
660  *pval = *(vec.pval);
661  }
662  ushort val[8];
663  vuint16m1_t* pval = (vuint16m1_t*)val;
664 };
665 
666 struct v_int16x8
667 {
668  typedef short lane_type;
669  enum { nlanes = 8 };
670 
671  v_int16x8() {}
672  explicit v_int16x8(vint16m1_t v)
673  {
674  *pval = v;
675  }
676  v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
677  {
678  short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
679  *pval = vle16_v_i16m1(v, nlanes);
680  }
681  operator vint16m1_t() const
682  {
683  return *pval;
684  }
685  short get0() const
686  {
687  return vmv_x(*pval);
688  }
689 
690  inline v_int16x8& operator=(const v_int16x8& vec) {
691  *pval = *(vec.pval);
692  return *this;
693  }
694  inline v_int16x8(const v_int16x8& vec) {
695  *pval = *(vec.pval);
696  }
697  short val[8];
698  vint16m1_t* pval = (vint16m1_t*)val;
699 };
700 
701 struct v_uint32x4
702 {
703  typedef unsigned lane_type;
704  enum { nlanes = 4 };
705 
706  v_uint32x4() {}
707  explicit v_uint32x4(vuint32m1_t v)
708  {
709  *pval = v;
710  }
711  v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
712  {
713  unsigned v[] = {v0, v1, v2, v3};
714  *pval = vle32_v_u32m1(v, nlanes);
715  }
716  operator vuint32m1_t() const
717  {
718  return *pval;
719  }
720  unsigned get0() const
721  {
722  return vmv_x(*pval);
723  }
724 
725  inline v_uint32x4& operator=(const v_uint32x4& vec) {
726  *pval = *(vec.pval);
727  return *this;
728  }
729  inline v_uint32x4(const v_uint32x4& vec) {
730  *pval = *(vec.pval);
731  }
732  unsigned val[4];
733  vuint32m1_t* pval = (vuint32m1_t*)val;
734 };
735 
736 struct v_int32x4
737 {
738  typedef int lane_type;
739  enum { nlanes = 4 };
740 
741  v_int32x4() {}
742  explicit v_int32x4(vint32m1_t v)
743  {
744  *pval = v;
745  }
746  v_int32x4(int v0, int v1, int v2, int v3)
747  {
748  int v[] = {v0, v1, v2, v3};
749  *pval = vle32_v_i32m1(v, nlanes);
750  }
751  operator vint32m1_t() const
752  {
753  return *pval;
754  }
755  int get0() const
756  {
757  return vmv_x(*pval);
758  }
759 
760  inline v_int32x4& operator=(const v_int32x4& vec) {
761  *pval = *(vec.pval);
762  return *this;
763  }
764  inline v_int32x4(const v_int32x4& vec) {
765  *pval = *(vec.pval);
766  }
767  int val[4];
768  vint32m1_t* pval = (vint32m1_t*)val;
769 };
770 
771 struct v_float32x4
772 {
773  typedef float lane_type;
774  enum { nlanes = 4 };
775 
776  v_float32x4() {}
777  explicit v_float32x4(vfloat32m1_t v)
778  {
779  *pval = v;
780  }
781  v_float32x4(float v0, float v1, float v2, float v3)
782  {
783  float v[] = {v0, v1, v2, v3};
784  *pval = vle32_v_f32m1(v, nlanes);
785  }
786  operator vfloat32m1_t() const
787  {
788  return *pval;
789  }
790  float get0() const
791  {
792  return vfmv_f(*pval);
793  }
794  inline v_float32x4& operator=(const v_float32x4& vec) {
795  *pval = *(vec.pval);
796  return *this;
797  }
798  inline v_float32x4(const v_float32x4& vec) {
799  *pval = *(vec.pval);
800  }
801  float val[4];
802  vfloat32m1_t* pval = (vfloat32m1_t*)val;
803 };
804 
805 struct v_uint64x2
806 {
807  typedef uint64 lane_type;
808  enum { nlanes = 2 };
809 
810  v_uint64x2() {}
811  explicit v_uint64x2(vuint64m1_t v)
812  {
813  *pval = v;
814  }
815  v_uint64x2(uint64 v0, uint64 v1)
816  {
817  uint64 v[] = {v0, v1};
818  *pval = vle64_v_u64m1(v, nlanes);
819  }
820  operator vuint64m1_t() const
821  {
822  return *pval;
823  }
824  uint64 get0() const
825  {
826  return vmv_x(*pval);
827  }
828 
829  inline v_uint64x2& operator=(const v_uint64x2& vec) {
830  *pval = *(vec.pval);
831  return *this;
832  }
833  inline v_uint64x2(const v_uint64x2& vec) {
834  *pval = *(vec.pval);
835  }
836  uint64 val[2];
837  vuint64m1_t* pval = (vuint64m1_t*)val;
838 };
839 
840 struct v_int64x2
841 {
842  typedef int64 lane_type;
843  enum { nlanes = 2 };
844 
845  v_int64x2() {}
846  explicit v_int64x2(vint64m1_t v)
847  {
848  *pval = v;
849  }
850  v_int64x2(int64 v0, int64 v1)
851  {
852  int64 v[] = {v0, v1};
853  *pval = vle64_v_i64m1(v, nlanes);
854  }
855  operator vint64m1_t() const
856  {
857  return *pval;
858  }
859  int64 get0() const
860  {
861  return vmv_x(*pval);
862  }
863 
864  inline v_int64x2& operator=(const v_int64x2& vec) {
865  *pval = *(vec.pval);
866  return *this;
867  }
868  inline v_int64x2(const v_int64x2& vec) {
869  *pval = *(vec.pval);
870  }
871  int64 val[2];
872  vint64m1_t* pval = (vint64m1_t*)val;
873 };
874 
875 #if CV_SIMD128_64F
876 struct v_float64x2
877 {
878  typedef double lane_type;
879  enum { nlanes = 2 };
880 
881  v_float64x2() {}
882  explicit v_float64x2(vfloat64m1_t v)
883  {
884  *pval = v;
885  }
886  v_float64x2(double v0, double v1)
887  {
888  double v[] = {v0, v1};
889  *pval = vle64_v_f64m1(v, nlanes);
890  }
891  operator vfloat64m1_t() const
892  {
893  return *pval;
894  }
895  double get0() const
896  {
897  return vfmv_f(*pval);
898  }
899 
900  inline v_float64x2& operator=(const v_float64x2& vec) {
901  *pval = *(vec.pval);
902  return *this;
903  }
904  inline v_float64x2(const v_float64x2& vec) {
905  *pval = *(vec.pval);
906  }
907  double val[2];
908  vfloat64m1_t* pval = (vfloat64m1_t*)val;
909 };
910 #endif // CV_SIMD128_64F
911 #endif // __clang__
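// Illustrative usage sketch (assumes a 128-bit RVV target): the wrapper types convert
// implicitly to and from the native m1 register types, so they can be mixed freely
// with raw RVV intrinsics.
//
//   v_uint8x16 a(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
//   vuint8m1_t r = a;                          // operator vuint8m1_t()
//   v_uint8x16 b(vadd_vv_u8m1(r, r, 16));      // wrap a native result again
//   uchar first = b.get0();                    // 0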
912 
914 
915 #define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
916 inline v_##_Tpvec v_setzero_##suffix1() \
917 { \
918  return v_##_Tpvec(vmv_v_x_##suffix2##m1(0, vl)); \
919 } \
920 inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
921 { \
922  return v_##_Tpvec(vmv_v_x_##suffix2##m1(v, vl)); \
923 }
924 
925 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, u8, u8, 16)
926 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, s8, i8, 16)
927 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, u16, u16, 8)
928 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, s16, i16, 8)
929 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, u32, u32, 4)
930 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, s32, i32, 4)
931 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, u64, u64, 2)
932 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, s64, i64, 2)
933 
934 #define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
935 inline v_##_Tpv v_setzero_##suffix() \
936 { \
937  return v_##_Tpv(vfmv_v_f_##suffix##m1(0, vl)); \
938 } \
939 inline v_##_Tpv v_setall_##suffix(_Tp v) \
940 { \
941  return v_##_Tpv(vfmv_v_f_##suffix##m1(v, vl)); \
942 }
943 
944 OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, f32, 4)
945 #if CV_SIMD128_64F
946 OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, f64, 2)
947 #endif
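// Usage sketch: v_setall_* broadcasts a scalar into every lane, v_setzero_* clears all lanes.
//
//   v_int32x4   ones  = v_setall_s32(1);    // {1, 1, 1, 1}
//   v_float32x4 zeros = v_setzero_f32();    // {0.f, 0.f, 0.f, 0.f}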
948 
950 
951 #define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
952 inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }
953 
954 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
955 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
956 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
957 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
958 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
959 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
960 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
961 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
962 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
963 #if CV_SIMD128_64F
964 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
965 #endif
966 
967 #define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
968 inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
969 { \
970  return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
971 } \
972 inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
973 { \
974  return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
975 }
976 
977 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, int8x16, u8, s8, u8, i8)
978 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, int16x8, u16, s16, u16, i16)
979 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, int32x4, u32, s32, u32, i32)
980 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, float32x4, u32, f32, u32, f32)
981 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, float32x4, s32, f32, i32, f32)
982 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, int64x2, u64, s64, u64, i64)
983 #if CV_SIMD128_64F
984 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, float64x2, u64, f64, u64, f64)
985 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64x2, float64x2, s64, f64, i64, f64)
986 #endif
987 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint16x8, u8, u16, u8, u16)
988 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint32x4, u8, u32, u8, u32)
989 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint64x2, u8, u64, u8, u64)
990 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint32x4, u16, u32, u16, u32)
991 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint64x2, u16, u64, u16, u64)
992 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, uint64x2, u32, u64, u32, u64)
993 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int16x8, s8, s16, i8, i16)
994 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int32x4, s8, s32, i8, i32)
995 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int64x2, s8, s64, i8, i64)
996 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int32x4, s16, s32, i16, i32)
997 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int64x2, s16, s64, i16, i64)
998 OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, int64x2, s32, s64, i32, i64)
999 
1000 
1001 #define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
1002 inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
1003 { \
1004  return v_##_Tpvec1(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v)));\
1005 } \
1006 inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
1007 { \
1008  return v_##_Tpvec2(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v)));\
1009 }
1010 
1011 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int16x8, u8, s16, u, i, 8, 16)
1012 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int32x4, u8, s32, u, i, 8, 32)
1013 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int64x2, u8, s64, u, i, 8, 64)
1014 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int8x16, u16, s8, u, i, 16, 8)
1015 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int32x4, u16, s32, u, i, 16, 32)
1016 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int64x2, u16, s64, u, i, 16, 64)
1017 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int8x16, u32, s8, u, i, 32, 8)
1018 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int16x8, u32, s16, u, i, 32, 16)
1019 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int64x2, u32, s64, u, i, 32, 64)
1020 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int8x16, u64, s8, u, i, 64, 8)
1021 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int16x8, u64, s16, u, i, 64, 16)
1022 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int32x4, u64, s32, u, i, 64, 32)
1023 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float32x4, u8, f32, u, f, 8, 32)
1024 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float32x4, u16, f32, u, f, 16, 32)
1025 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, float32x4, u64, f32, u, f, 64, 32)
1026 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float32x4, s8, f32, i, f, 8, 32)
1027 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float32x4, s16, f32, i, f, 16, 32)
1028 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64x2, float32x4, s64, f32, i, f, 64, 32)
1029 #if CV_SIMD128_64F
1030 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float64x2, u8, f64, u, f, 8, 64)
1031 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float64x2, u16, f64, u, f, 16, 64)
1032 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, float64x2, u32, f64, u, f, 32, 64)
1033 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float64x2, s8, f64, i, f, 8, 64)
1034 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float64x2, s16, f64, i, f, 16, 64)
1035 OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32x4, float64x2, s32, f64, i, f, 32, 64)
1036 #endif
1037 
1038 // Three times reinterpret
1039 #if CV_SIMD128_64F
1040 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v)
1041 {
1042  return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v))));
1043 }
1044 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v)
1045 {
1046  return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v))));
1047 }
1048 #endif
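// Usage sketch: reinterpret casts only relabel the 128 bits, they never convert values.
//
//   v_uint32x4  u = v_setall_u32(0x3f800000u);  // IEEE-754 bit pattern of 1.0f
//   v_float32x4 f = v_reinterpret_as_f32(u);    // every lane reads as 1.0f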
1049 
1051 
1052 #define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vmv, vl) \
1053 template <int s> \
1054 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
1055 { \
1056  return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
1057 } \
1058 template<int i> inline _Tp v_extract_n(_Tpvec v) \
1059 { \
1060  return _Tp(vmv(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), v, i, vl))); \
1061 }
1062 
1063 
1064 OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8x16, uchar, u8, vmv_x_s_u8m1_u8, 16)
1065 OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8x16, schar, i8, vmv_x_s_i8m1_i8, 16)
1066 OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16x8, ushort, u16, vmv_x_s_u16m1_u16, 8)
1067 OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16x8, short, i16, vmv_x_s_i16m1_i16, 8)
1068 OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32x4, uint, u32, vmv_x_s_u32m1_u32, 4)
1069 OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32x4, int, i32, vmv_x_s_i32m1_i32, 4)
1070 OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64x2, uint64, u64, vmv_x_s_u64m1_u64, 2)
1071 OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64x2, int64, i64, vmv_x_s_i64m1_i64, 2)
1072 
1073 #define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vmv, vl) \
1074 template <int s> \
1075 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
1076 { \
1077  return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
1078 } \
1079 template<int i> inline _Tp v_extract_n(_Tpvec v) \
1080 { \
1081  return _Tp(vmv(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), v, i, vl))); \
1082 }
1083 
1084 OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32x4, float, f32, vfmv_f_s_f32m1_f32, 4)
1085 #if CV_SIMD128_64F
1086 OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64x2, double, f64, vfmv_f_s_f64m1_f64, 2)
1087 #endif
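// Usage sketch: v_extract_n<i> reads lane i, while v_extract<s> concatenates a and b
// and returns 128 bits starting at lane s (the tail of a followed by the head of b).
//
//   v_int32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
//   int lane2   = v_extract_n<2>(a);    // 2
//   v_int32x4 c = v_extract<1>(a, b);   // {1, 2, 3, 4}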
1088 
1090 
1091 #define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
1092 inline _Tpvec v_load(const _Tp* ptr) \
1093 { \
1094  return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
1095 } \
1096 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1097 { \
1098  return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
1099 } \
1100 inline _Tpvec v_load_low(const _Tp* ptr) \
1101 { \
1102  _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
1103  return res; \
1104 } \
1105 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1106 { \
1107  vse##width##_v_##suffix##m1(ptr, a, vl); \
1108 } \
1109 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1110 { \
1111  vse##width##_v_##suffix##m1(ptr, a, vl); \
1112 } \
1113 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1114 { \
1115  vse##width##_v_##suffix##m1(ptr, a, vl); \
1116 } \
1117 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1118 { \
1119  vse##width##_v_##suffix##m1(ptr, a, vl); \
1120 } \
1121 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1122 { \
1123  vse##width##_v_##suffix##m1(ptr, a, hvl); \
1124 } \
1125 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1126 { \
1127  vse##width##_v_##suffix##m1(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
1128 }
1129 
1130 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 16, 8, u8, vmv_v_x_u8m1)
1131 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 16, 8, i8, vmv_v_x_i8m1)
1132 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 8, 16, u16, vmv_v_x_u16m1)
1133 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 8, 16, i16, vmv_v_x_i16m1)
1134 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 4, 32, u32, vmv_v_x_u32m1)
1135 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 4, 32, i32, vmv_v_x_i32m1)
1136 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 2, 64, u64, vmv_v_x_u64m1)
1137 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 2, 64, i64, vmv_v_x_i64m1)
1138 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 4, 32, f32, vfmv_v_f_f32m1)
1139 #if CV_SIMD128_64F
1140 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f64, vfmv_v_f_f64m1)
1141 #endif
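// Usage sketch: v_load/v_store move one full 128-bit register (4 floats here),
// while v_load_low/v_store_low/v_store_high work with 64-bit halves.
//
//   float src[4] = {1.f, 2.f, 3.f, 4.f}, dst[4];
//   v_float32x4 r = v_load(src);
//   v_store(dst, r);                    // dst == {1, 2, 3, 4}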
1142 
1143 inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
1144 {
1145  schar elems[16] =
1146  {
1147  ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
1148  ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
1149  };
1150  return v_int8x16(vle8_v_i8m1(elems, 16));
1151 }
1152 inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1)); }
1153 
1154 inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
1155 {
1156  short elems[8] =
1157  {
1158  ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
1159  };
1160  return v_int16x8(vle16_v_i16m1(elems, 8));
1161 }
1162 inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1)); }
1163 
1164 inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
1165 {
1166  int elems[4] =
1167  {
1168  ptr0[0], ptr0[1], ptr1[0], ptr1[1]
1169  };
1170  return v_int32x4(vle32_v_i32m1(elems, 4));
1171 }
1172 inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
1173 {
1174  float elems[4] =
1175  {
1176  ptr0[0], ptr0[1], ptr1[0], ptr1[1]
1177  };
1178  return v_float32x4(vle32_v_f32m1(elems, 4));
1179 }
1180 inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1)); }
1181 
1182 inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
1183 {
1184  int64 elems[2] =
1185  {
1186  ptr0[0], ptr1[0]
1187  };
1188  return v_int64x2(vle64_v_i64m1(elems, 2));
1189 }
1190 inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1)); }
1191 
1192 #if CV_SIMD128_64F
1193 inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
1194 {
1195  double elems[2] =
1196  {
1197  ptr0[0], ptr1[0]
1198  };
1199  return v_float64x2(vle64_v_f64m1(elems, 2));
1200 }
1201 #endif
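// Usage sketch: v_load_halves fills the low half from ptr0 and the high half from ptr1.
//
//   short lo[4] = {0, 1, 2, 3}, hi[4] = {4, 5, 6, 7};
//   v_int16x8 v = v_load_halves(lo, hi);   // {0, 1, 2, 3, 4, 5, 6, 7}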
1202 
1203 
1205 
1206 inline v_int8x16 v_lut(const schar* tab, const int* idx)
1207 {
1208  schar elems[16] =
1209  {
1210  tab[idx[ 0]],
1211  tab[idx[ 1]],
1212  tab[idx[ 2]],
1213  tab[idx[ 3]],
1214  tab[idx[ 4]],
1215  tab[idx[ 5]],
1216  tab[idx[ 6]],
1217  tab[idx[ 7]],
1218  tab[idx[ 8]],
1219  tab[idx[ 9]],
1220  tab[idx[10]],
1221  tab[idx[11]],
1222  tab[idx[12]],
1223  tab[idx[13]],
1224  tab[idx[14]],
1225  tab[idx[15]]
1226  };
1227  return v_int8x16(vle8_v_i8m1(elems, 16));
1228 }
1229 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1230 {
1231  schar elems[16] =
1232  {
1233  tab[idx[0]],
1234  tab[idx[0] + 1],
1235  tab[idx[1]],
1236  tab[idx[1] + 1],
1237  tab[idx[2]],
1238  tab[idx[2] + 1],
1239  tab[idx[3]],
1240  tab[idx[3] + 1],
1241  tab[idx[4]],
1242  tab[idx[4] + 1],
1243  tab[idx[5]],
1244  tab[idx[5] + 1],
1245  tab[idx[6]],
1246  tab[idx[6] + 1],
1247  tab[idx[7]],
1248  tab[idx[7] + 1]
1249  };
1250  return v_int8x16(vle8_v_i8m1(elems, 16));
1251 }
1252 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1253 {
1254  schar elems[16] =
1255  {
1256  tab[idx[0]],
1257  tab[idx[0] + 1],
1258  tab[idx[0] + 2],
1259  tab[idx[0] + 3],
1260  tab[idx[1]],
1261  tab[idx[1] + 1],
1262  tab[idx[1] + 2],
1263  tab[idx[1] + 3],
1264  tab[idx[2]],
1265  tab[idx[2] + 1],
1266  tab[idx[2] + 2],
1267  tab[idx[2] + 3],
1268  tab[idx[3]],
1269  tab[idx[3] + 1],
1270  tab[idx[3] + 2],
1271  tab[idx[3] + 3]
1272  };
1273  return v_int8x16(vle8_v_i8m1(elems, 16));
1274 }
1275 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1276 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1277 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
1278 
1279 inline v_int16x8 v_lut(const short* tab, const int* idx)
1280 {
1281  short elems[8] =
1282  {
1283  tab[idx[0]],
1284  tab[idx[1]],
1285  tab[idx[2]],
1286  tab[idx[3]],
1287  tab[idx[4]],
1288  tab[idx[5]],
1289  tab[idx[6]],
1290  tab[idx[7]]
1291  };
1292  return v_int16x8(vle16_v_i16m1(elems, 8));
1293 }
1294 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1295 {
1296  short elems[8] =
1297  {
1298  tab[idx[0]],
1299  tab[idx[0] + 1],
1300  tab[idx[1]],
1301  tab[idx[1] + 1],
1302  tab[idx[2]],
1303  tab[idx[2] + 1],
1304  tab[idx[3]],
1305  tab[idx[3] + 1]
1306  };
1307  return v_int16x8(vle16_v_i16m1(elems, 8));
1308 }
1309 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1310 {
1311  short elems[8] =
1312  {
1313  tab[idx[0]],
1314  tab[idx[0] + 1],
1315  tab[idx[0] + 2],
1316  tab[idx[0] + 3],
1317  tab[idx[1]],
1318  tab[idx[1] + 1],
1319  tab[idx[1] + 2],
1320  tab[idx[1] + 3]
1321  };
1322  return v_int16x8(vle16_v_i16m1(elems, 8));
1323 }
1324 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
1325 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
1326 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
1327 
1328 inline v_int32x4 v_lut(const int* tab, const int* idx)
1329 {
1330  int elems[4] =
1331  {
1332  tab[idx[0]],
1333  tab[idx[1]],
1334  tab[idx[2]],
1335  tab[idx[3]]
1336  };
1337  return v_int32x4(vle32_v_i32m1(elems, 4));
1338 }
1339 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1340 {
1341  int elems[4] =
1342  {
1343  tab[idx[0]],
1344  tab[idx[0] + 1],
1345  tab[idx[1]],
1346  tab[idx[1] + 1]
1347  };
1348  return v_int32x4(vle32_v_i32m1(elems, 4));
1349 }
1350 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1351 {
1352  return v_int32x4(vle32_v_i32m1(tab + idx[0], 4));
1353 }
1354 
1355 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
1356 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1357 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1358 
1359 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1360 {
1361  int64_t elems[2] =
1362  {
1363  tab[idx[0]],
1364  tab[idx[1]]
1365  };
1366  return v_int64x2(vle64_v_i64m1(elems, 2));
1367 }
1368 inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx)
1369 {
1370  return v_int64x2(vle64_v_i64m1(tab + idx[0], 2));
1371 }
1372 inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1373 inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1374 
1375 inline v_float32x4 v_lut(const float* tab, const int* idx)
1376 {
1377  float elems[4] =
1378  {
1379  tab[idx[0]],
1380  tab[idx[1]],
1381  tab[idx[2]],
1382  tab[idx[3]]
1383  };
1384  return v_float32x4(vle32_v_f32m1(elems, 4));
1385 }
1386 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1387 {
1388  float elems[4] =
1389  {
1390  tab[idx[0]],
1391  tab[idx[0] + 1],
1392  tab[idx[1]],
1393  tab[idx[1] + 1]
1394  };
1395  return v_float32x4(vle32_v_f32m1(elems, 4));
1396 }
1397 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1398 {
1399  return v_float32x4(vle32_v_f32m1(tab + idx[0], 4));
1400 }
1401 
1402 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1403 {
1404  int elems[4] =
1405  {
1406  tab[v_extract_n<0>(idxvec)],
1407  tab[v_extract_n<1>(idxvec)],
1408  tab[v_extract_n<2>(idxvec)],
1409  tab[v_extract_n<3>(idxvec)]
1410  };
1411  return v_int32x4(vle32_v_i32m1(elems, 4));
1412 }
1413 
1414 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1415 {
1416  unsigned elems[4] =
1417  {
1418  tab[v_extract_n<0>(idxvec)],
1419  tab[v_extract_n<1>(idxvec)],
1420  tab[v_extract_n<2>(idxvec)],
1421  tab[v_extract_n<3>(idxvec)]
1422  };
1423  return v_uint32x4(vle32_v_u32m1(elems, 4));
1424 }
1425 
1426 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1427 {
1428  float elems[4] =
1429  {
1430  tab[v_extract_n<0>(idxvec)],
1431  tab[v_extract_n<1>(idxvec)],
1432  tab[v_extract_n<2>(idxvec)],
1433  tab[v_extract_n<3>(idxvec)]
1434  };
1435  return v_float32x4(vle32_v_f32m1(elems, 4));
1436 }
1437 
1438 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1439 {
1440  int idx[4];
1441  v_store_aligned(idx, idxvec);
1442 
1443  x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1444  y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
1445 }
1446 
1447 #if CV_SIMD128_64F
1448 inline v_float64x2 v_lut(const double* tab, const int* idx)
1449 {
1450  double elems[2] =
1451  {
1452  tab[idx[0]],
1453  tab[idx[1]]
1454  };
1455  return v_float64x2(vle64_v_f64m1(elems, 2));
1456 }
1457 
1458 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1459 {
1460  return v_float64x2(vle64_v_f64m1(tab + idx[0], 2));
1461 }
1462 
1463 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1464 {
1465  double elems[2] =
1466  {
1467  tab[v_extract_n<0>(idxvec)],
1468  tab[v_extract_n<1>(idxvec)]
1469  };
1470  return v_float64x2(vle64_v_f64m1(elems, 2));
1471 }
1472 
1473 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1474 {
1475  int idx[4] = {0};
1476  v_store_aligned(idx, idxvec);
1477 
1478  x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1479  y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
1480 }
1481 #endif
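// Usage sketch: v_lut gathers arbitrary elements, while v_lut_pairs/v_lut_quads gather
// consecutive runs of 2/4 elements starting at each index.
//
//   float tab[8] = {10, 11, 12, 13, 14, 15, 16, 17};
//   int   idx[4] = {6, 0, 3, 5};
//   v_float32x4 g = v_lut(tab, idx);         // {16, 10, 13, 15}
//   v_float32x4 p = v_lut_pairs(tab, idx);   // {16, 17, 10, 11}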
1482 
1484 
1485 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1486 {
1487  ushort ptr[16] = {0};
1488  v_store(ptr, a);
1489  v_store(ptr + 8, b);
1490  return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr, 16), 0, 16));
1491 }
1492 
1493 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1494  const v_uint32x4& c, const v_uint32x4& d)
1495 {
1496  unsigned ptr[16] = {0};
1497  v_store(ptr, a);
1498  v_store(ptr + 4, b);
1499  v_store(ptr + 8, c);
1500  v_store(ptr + 12, d);
1501  return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr, 16), 0, 16), 0, 16));
1502 }
1503 
1504 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
1505  const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
1506  const v_uint64x2& g, const v_uint64x2& h)
1507 {
1508  uint64 ptr[16] = {0};
1509  v_store(ptr, a);
1510  v_store(ptr + 2, b);
1511  v_store(ptr + 4, c);
1512  v_store(ptr + 6, d);
1513  v_store(ptr + 8, e);
1514  v_store(ptr + 10, f);
1515  v_store(ptr + 12, g);
1516  v_store(ptr + 14, h);
1517  return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr, 16), 0, 16), 0, 16), 0, 16));
1518 }
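// Usage sketch: v_pack_b keeps the lowest byte of each wider lane and packs all inputs
// into a single 16-byte vector (typically used to pack comparison masks).
//
//   v_uint16x8 a = v_setall_u16(0xFFFF), b = v_setzero_u16();
//   v_uint8x16 m = v_pack_b(a, b);   // eight 0xFF lanes followed by eight 0x00 lanes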
1519 
1521 #define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, vl) \
1522 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
1523 { \
1524  return _Tpvec(intrin(a, b, vl)); \
1525 } \
1526 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
1527 { \
1528  a = _Tpvec(intrin(a, b, vl)); \
1529  return a; \
1530 }
1531 
1532 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 16)
1533 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 16)
1534 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 16)
1535 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 16)
1536 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 16)
1537 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 16)
1538 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 8)
1539 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 8)
1540 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 8)
1541 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 8)
1542 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 8)
1543 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 8)
1544 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 4)
1545 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 4)
1546 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 4)
1547 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 4)
1548 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 4)
1549 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 4)
1550 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 4)
1551 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 4)
1552 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 4)
1553 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 4)
1554 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 4)
1555 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 4)
1556 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 2)
1557 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 2)
1558 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 2)
1559 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 2)
1560 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 2)
1561 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 2)
1562 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 2)
1563 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 2)
1564 #if CV_SIMD128_64F
1565 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 2)
1566 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 2)
1567 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 2)
1568 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 2)
1569 #endif
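// Usage sketch: 8- and 16-bit '+'/'-' saturate (vsadd/vssub), wider types wrap around.
//
//   v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//   v_uint8x16 s = a + b;                                // saturates to 255 per lane
//   v_int32x4  p = v_setall_s32(3) * v_setall_s32(7);    // {21, 21, 21, 21}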
1570 
1571 
1573 
1574 #define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, vl) \
1575 OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, vl) \
1576 OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, vl) \
1577 OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, vl) \
1578 inline _Tpvec operator ~ (const _Tpvec& a) \
1579 { \
1580  return _Tpvec(vnot_v_##suffix##m1(a, vl)); \
1581 }
1582 
1583 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 16)
1584 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 16)
1585 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 8)
1586 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 8)
1587 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 4)
1588 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 4)
1589 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 2)
1590 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 2)
1591 
1592 #define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
1593 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
1594 { \
1595  return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
1596 } \
1597 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
1598 { \
1599  a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
1600  return a; \
1601 }
1602 
1603 OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1)
1604 OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
1605 OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)
1606 
1607 inline v_float32x4 operator ~ (const v_float32x4& a)
1608 {
1609  return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a), 4)));
1610 }
1611 
1612 #if CV_SIMD128_64F
1613 #define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
1614 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
1615 { \
1616  return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
1617 } \
1618 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
1619 { \
1620  a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
1621  return a; \
1622 }
1623 
1624 OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1)
1625 OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
1626 OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)
1627 
1628 inline v_float64x2 operator ~ (const v_float64x2& a)
1629 {
1630  return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a), 2)));
1631 }
1632 #endif
1633 
1635 
1636 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
1637 inline _Tpvec operator << (const _Tpvec& a, int n) \
1638 { \
1639  return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
1640 } \
1641 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1642 { \
1643  return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
1644 } \
1645 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1646 { \
1647  return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
1648 } \
1649 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1650 { \
1651  return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
1652 }
1653 
1654 #define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
1655 inline _Tpvec operator << (const _Tpvec& a, int n) \
1656 { \
1657  return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
1658 } \
1659 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1660 { \
1661  return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
1662 } \
1663 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1664 { \
1665  return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
1666 } \
1667 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1668 { \
1669  return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
1670 }
1671 
1672 OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 16)
1673 OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 8)
1674 OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 4)
1675 OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 2)
1676 OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 16)
1677 OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 8)
1678 OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 4)
1679 OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2)
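// Usage sketch: v_shl/v_shr take the shift amount as a template parameter; '>>' is a
// logical shift for unsigned types and an arithmetic shift for signed types.
//
//   v_int32x4  x = v_setall_s32(-8);
//   v_int32x4  y = v_shr<1>(x);              // arithmetic: {-4, -4, -4, -4}
//   v_uint32x4 z = v_setall_u32(8) << 2;     // {32, 32, 32, 32}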
1680 
1681 
1683 
1684 #define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
1685 inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
1686 { \
1687  uint64_t ones = -1; \
1688  return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); \
1689 }
1690 
1691 #define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
1692 inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
1693 { \
1694  union { uint64 u; double d; } ones; ones.u = -1; \
1695  return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
1696 }
1697 
1698 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) \
1699 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
1700 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
1701 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, vl) \
1702 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, vl) \
1703 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, vl) \
1704 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, vl)
1705 
1706 #define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width, vl) \
1707 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
1708 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
1709 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, vl) \
1710 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, vl) \
1711 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, vl) \
1712 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, vl)
1713 
1714 #define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width, vl) \
1715 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, vl) \
1716 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, vl) \
1717 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, vl) \
1718 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, vl) \
1719 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, vl) \
1720 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, vl)
1721 
1722 
1723 OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8, 16)
1724 OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16, 8)
1725 OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32, 4)
1726 OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64, 2)
1727 OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8, 16)
1728 OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16, 8)
1729 OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32, 4)
1730 OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64, 2)
1731 OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32, 4)
1732 #if CV_SIMD128_64F
1733 OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64, 2)
1734 #endif
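// Usage sketch: comparisons return a vector with all bits set in lanes where the
// predicate holds and zero elsewhere, so the result can be used directly as a mask.
//
//   v_int32x4 a(1, 5, 3, 7), b = v_setall_s32(4);
//   v_int32x4 m = a > b;          // {0, -1, 0, -1}
//   v_int32x4 r = (a > b) & a;    // keep only the lanes greater than 4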
1735 
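// NaN is the only value that compares unequal to itself, so (a == a) yields an
// all-ones mask exactly in the lanes that are not NaN.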
1736 inline v_float32x4 v_not_nan(const v_float32x4& a)
1737 { return a == a; }
1738 
1739 #if CV_SIMD128_64F
1740 inline v_float64x2 v_not_nan(const v_float64x2& a)
1741 { return a == a; }
1742 #endif
1743 
1745 
1746 #define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
1747 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1748 { \
1749  return _Tpvec(intrin(a, b, vl)); \
1750 }
1751 
1752 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
1753 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
1754 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
1755 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
1756 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
1757 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
1758 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
1759 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
1760 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
1761 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
1762 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
1763 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
1764 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
1765 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
1766 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 2)
1767 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 2)
1768 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 2)
1769 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 2)
1770 #if CV_SIMD128_64F
1771 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
1772 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
1773 #endif
1774 
1776 
1777 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
1778 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
1779 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
1780 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
1781 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
1782 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
1783 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
1784 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
1785 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
1786 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
1787 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
1788 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
1789 
1791 
1792 #define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
1793 inline scalartype v_reduce_sum(const _Tpvec& a) \
1794 { \
1795  _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
1796  _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \
1797  res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
1798  return (scalartype)(_wTpvec(res).get0()); \
1799 }
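// The reduction helper above sums into a wider element: for the 8/16/32-bit types `red`
// is a widening reduction (vwredsum/vwredsumu) whose scalar result lands in element 0 of
// the wider destination, which is then read back with get0(); the 64-bit variants fall
// back to a plain vredsum. The v_uint8x16 instantiation below expands to roughly:
//
//   inline unsigned v_reduce_sum(const v_uint8x16& a)
//   {
//       vuint16m1_t zero = vmv_v_x_u16m1(0, 16);
//       vuint16m1_t res  = vmv_v_x_u16m1(0, 16);
//       res = vwredsumu_vs_u8m1_u16m1(res, a, zero, 16);
//       return (unsigned)(v_uint16x8(res).get0());
//   }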
1800 
1801 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
1802 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
1803 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 8, wredsumu)
1804 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 8, wredsum)
1805 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 4, wredsumu)
1806 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 4, wredsum)
1807 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 2, redsum)
1808 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 2, redsum)
1809 
1810 #define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
1811 inline scalartype v_reduce_sum(const _Tpvec& a) \
1812 { \
1813  _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
1814  _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \
1815  res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
1816  return (scalartype)(_wTpvec(res).get0()); \
1817 }
1818 
1819 // vfredsum for float has been renamed to vfredosum (ordered sum); the GNU toolchain has been updated accordingly.
1820 OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 4, fredosum)
1821 #if CV_SIMD128_64F
1822 OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 2, fredosum)
1823 #endif
1824 
1825 
1826 #define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
1827 inline scalartype v_reduce_##func(const _Tpvec& a) \
1828 { \
1829  _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a, vl)); \
1830  return scalartype(res.get0()); \
1831 }
1832 
1833 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 16, redminu)
1834 OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 16, redmin)
1835 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 8, redminu)
1836 OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 8, redmin)
1837 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 4, redminu)
1838 OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 4, redmin)
1839 OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 4, fredmin)
1840 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 16, redmaxu)
1841 OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 16, redmax)
1842 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 8, redmaxu)
1843 OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 8, redmax)
1844 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 4, redmaxu)
1845 OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 4, redmax)
1846 OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax)
1847 
1848 
1849 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1850  const v_float32x4& c, const v_float32x4& d)
1851 {
1852  float elems[4] =
1853  {
1854  v_reduce_sum(a),
1855  v_reduce_sum(b),
1856  v_reduce_sum(c),
1857  v_reduce_sum(d)
1858  };
1859  return v_float32x4(vle32_v_f32m1(elems, 4));
1860 }
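// Usage sketch (p0..p3 are hypothetical pointers): each output lane holds the horizontal
// sum of the corresponding input vector, which is handy when accumulating four row sums
// at once.
//
//   v_float32x4 r0 = v_load(p0), r1 = v_load(p1), r2 = v_load(p2), r3 = v_load(p3);
//   v_float32x4 sums = v_reduce_sum4(r0, r1, r2, r3);   // {sum(r0), sum(r1), sum(r2), sum(r3)}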
1861 
1863 
1864 inline v_float32x4 v_sqrt(const v_float32x4& x)
1865 {
1866  return v_float32x4(vfsqrt_v_f32m1(x, 4));
1867 }
1868 
1869 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1870 {
1871  v_float32x4 one = v_setall_f32(1.0f);
1872  return one / v_sqrt(x);
1873 }
1874 
1875 #if CV_SIMD128_64F
1876 inline v_float64x2 v_sqrt(const v_float64x2& x)
1877 {
1878  return v_float64x2(vfsqrt_v_f64m1(x, 2));
1879 }
1880 
1881 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1882 {
1883  v_float64x2 one = v_setall_f64(1.0);
1884  return one / v_sqrt(x);
1885 }
1886 #endif
1887 
1888 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
1889 {
1890  v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
1891  return v_sqrt(x);
1892 }
1893 
1894 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
1895 {
1896  return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
1897 }
1898 
1899 #if CV_SIMD128_64F
1900 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
1901 {
1902  v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
1903  return v_sqrt(x);
1904 }
1905 
1906 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
1907 {
1908  return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
1909 }
1910 #endif
1911 
1913 
1914 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1915 {
1916  return v_float32x4(vfmacc_vv_f32m1(c, a, b, 4));
1917 }
1918 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1919 {
1920  return v_int32x4(vmacc_vv_i32m1(c, a, b, 4));
1921 }
1922 
1923 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1924 {
1925  return v_fma(a, b, c);
1926 }
1927 
1928 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1929 {
1930  return v_fma(a, b, c);
1931 }
1932 
1933 #if CV_SIMD128_64F
1934 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1935 {
1936  return v_float64x2(vfmacc_vv_f64m1(c, a, b, 2));
1937 }
1938 
1939 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1940 {
1941  return v_fma(a, b, c);
1942 }
1943 #endif
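// Both v_fma and v_muladd compute a * b + c per lane; on this backend they map to a
// single vfmacc/vmacc, so the float versions are fused. Usage sketch (x, k and bias are
// hypothetical vectors):
//
//   v_float32x4 y = v_fma(x, k, bias);   // y[i] = x[i] * k[i] + bias[i]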
1944 
1946 
1947 // Use the overloaded vcpop in clang; no cast such as (vuint64m1_t) is needed.
1948 #ifndef __clang__
1949 #define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \
1950 inline bool v_check_all(const _Tpvec& a) \
1951 { \
1952  auto v0 = vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl); \
1953  v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
1954  return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) == 0; \
1955 } \
1956 inline bool v_check_any(const _Tpvec& a) \
1957 { \
1958  auto v0 = vsrl_vx_##suffix##m1(a, shift, vl); \
1959  v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
1960  return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) != 0; \
1961 }
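// The helper above inspects only the sign bit of every lane: the logical shift by
// (element width - 1) moves the MSB down to bit 0, and the lanes are then OR-ed together
// through a u32 view. v_check_all() is true when every lane has its MSB set and
// v_check_any() when at least one lane does, which matches the all-ones/all-zeros masks
// produced by the comparison operators. Usage sketch:
//
//   if (v_check_any(a == b)) { /* at least one pair of lanes compared equal */ }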
1962 
1963 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16)
1964 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8)
1965 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4)
1966 //OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 2)
1967 inline bool v_check_all(const v_uint64x2& a)
1968 {
1969  v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(vnot_v_u64m1(a, 2), 63, 2));
1970  return (v.val[0] | v.val[1]) == 0;
1971 }
1972 inline bool v_check_any(const v_uint64x2& a)
1973 {
1974  v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(a, 63, 2));
1975  return (v.val[0] | v.val[1]) != 0;
1976 }
1977 
1978 inline bool v_check_all(const v_int8x16& a)
1979 { return v_check_all(v_reinterpret_as_u8(a)); }
1980 inline bool v_check_any(const v_int8x16& a)
1981 { return v_check_any(v_reinterpret_as_u8(a)); }
1982 
1983 inline bool v_check_all(const v_int16x8& a)
1984 { return v_check_all(v_reinterpret_as_u16(a)); }
1985 inline bool v_check_any(const v_int16x8& a)
1986 { return v_check_any(v_reinterpret_as_u16(a)); }
1987 
1988 inline bool v_check_all(const v_int32x4& a)
1989 { return v_check_all(v_reinterpret_as_u32(a)); }
1990 inline bool v_check_any(const v_int32x4& a)
1991 { return v_check_any(v_reinterpret_as_u32(a)); }
1992 
1993 inline bool v_check_all(const v_float32x4& a)
1994 { return v_check_all(v_reinterpret_as_u32(a)); }
1995 inline bool v_check_any(const v_float32x4& a)
1996 { return v_check_any(v_reinterpret_as_u32(a)); }
1997 
1998 inline bool v_check_all(const v_int64x2& a)
1999 { return v_check_all(v_reinterpret_as_u64(a)); }
2000 inline bool v_check_any(const v_int64x2& a)
2001 { return v_check_any(v_reinterpret_as_u64(a)); }
2002 
2003 #if CV_SIMD128_64F
2004 inline bool v_check_all(const v_float64x2& a)
2005 { return v_check_all(v_reinterpret_as_u64(a)); }
2006 inline bool v_check_any(const v_float64x2& a)
2007 { return v_check_any(v_reinterpret_as_u64(a)); }
2008 #endif
2009 #else
2010 #define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \
2011 inline bool v_check_all(const _Tpvec& a) \
2012 { \
2013  return vcpop(vmslt(a, 0, vl), vl) == vl; \
2014 } \
2015 inline bool v_check_any(const _Tpvec& a) \
2016 { \
2017  return vcpop(vmslt(a, 0, vl), vl) != 0; \
2018 }
2019 
2020 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8x16, 16)
2021 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16x8, 8)
2022 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32x4, 4)
2023 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64x2, 2)
2024 
2025 
2026 inline bool v_check_all(const v_uint8x16& a)
2027 { return v_check_all(v_reinterpret_as_s8(a)); }
2028 inline bool v_check_any(const v_uint8x16& a)
2029 { return v_check_any(v_reinterpret_as_s8(a)); }
2030 
2031 inline bool v_check_all(const v_uint16x8& a)
2032 { return v_check_all(v_reinterpret_as_s16(a)); }
2033 inline bool v_check_any(const v_uint16x8& a)
2034 { return v_check_any(v_reinterpret_as_s16(a)); }
2035 
2036 inline bool v_check_all(const v_uint32x4& a)
2037 { return v_check_all(v_reinterpret_as_s32(a)); }
2038 inline bool v_check_any(const v_uint32x4& a)
2039 { return v_check_any(v_reinterpret_as_s32(a)); }
2040 
2041 inline bool v_check_all(const v_float32x4& a)
2042 { return v_check_all(v_reinterpret_as_s32(a)); }
2043 inline bool v_check_any(const v_float32x4& a)
2044 { return v_check_any(v_reinterpret_as_s32(a)); }
2045 
2046 inline bool v_check_all(const v_uint64x2& a)
2047 { return v_check_all(v_reinterpret_as_s64(a)); }
2048 inline bool v_check_any(const v_uint64x2& a)
2049 { return v_check_any(v_reinterpret_as_s64(a)); }
2050 
2051 #if CV_SIMD128_64F
2052 inline bool v_check_all(const v_float64x2& a)
2053 { return v_check_all(v_reinterpret_as_s64(a)); }
2054 inline bool v_check_any(const v_float64x2& a)
2055 { return v_check_any(v_reinterpret_as_s64(a)); }
2056 #endif
2057 #endif
2059 
2060 #define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
2061 inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
2062 { \
2063  return v_max(a, b) - v_min(a, b); \
2064 }
2065 
2066 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff)
2067 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff)
2068 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff)
2069 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff)
2070 #if CV_SIMD128_64F
2071 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
2072 #endif
2073 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
2074 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
2075 
2076 #define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(ivec, uvec, itype, utype, isuf, usuf, vlen) \
2077 inline uvec v_absdiff(const ivec& a, const ivec& b) \
2078 { \
2079  itype max = vmax_vv_##isuf(a, b, vlen); \
2080  itype min = vmin_vv_##isuf(a, b, vlen); \
2081  return uvec(vreinterpret_v_##isuf##_##usuf(vsub_vv_##isuf(max, min, vlen))); \
2082 }
2083 
2084 OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vint8m1_t, vuint8m1_t, i8m1, u8m1, 16)
2085 OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vint16m1_t, vuint16m1_t, i16m1, u16m1, 8)
2086 OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vint32m1_t, vuint32m1_t, i32m1, u32m1, 4)
2087 
2088 #define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
2089 inline _Tprvec v_abs(const _Tpvec& a) \
2090 { \
2091  return v_absdiff(a, v_setzero_##suffix()); \
2092 }
2093 
2094 OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8)
2095 OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16)
2096 OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32)
2097 OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32)
2098 #if CV_SIMD128_64F
2099 OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64)
2100 #endif
2101 
2102 
2103 #define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
2104 inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
2105 { \
2106  return v_reduce_sum(v_absdiff(a, b)); \
2107 }
2108 
2109 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned)
2110 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
2111 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
2112 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
2113 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
2114 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
2115 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float)
2116 
2117 
2119 #define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, vl) \
2120 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
2121 { \
2122  return _Tpvec(merge(ne(mask, 0, vl), b, a, vl)); \
2123 }
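// v_select(mask, a, b) keeps a in the lanes where mask is non-zero and b elsewhere:
// the vmsne/vmfne comparison turns the mask vector into an RVV mask register, and
// vmerge then picks between the two operands per lane. Usage sketch (a and b are
// hypothetical vectors):
//
//   v_float32x4 m = a > b;               // all-ones where a[i] > b[i], zeros elsewhere
//   v_float32x4 r = v_select(m, a, b);   // per-lane maximum of a and b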
2124 
2125 OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 16)
2126 OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 16)
2127 OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 8)
2128 OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 8)
2129 OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 4)
2130 OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 4)
2131 OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 4)
2132 #if CV_SIMD128_64F
2133 OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 2)
2134 #endif
2135 
2137 
2138 #define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
2139 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
2140 { \
2141  return _Tpvec(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
2142 } \
2143 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
2144 { \
2145  return _Tpvec(vslideup_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
2146 } \
2147 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
2148 { return a; } \
2149 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
2150 { \
2151  return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
2152 } \
2153 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
2154 { \
2155  return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
2156 } \
2157 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
2158 { CV_UNUSED(b); return a; }
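// The rotate helpers shift whole lanes, not bits: v_rotate_right<n>(a) drops the lowest
// n lanes and zero-fills the top, v_rotate_left<n>(a) does the opposite, and the
// two-operand forms pull the vacated lanes from b as if a and b were concatenated.
// Sketch with 32-bit lanes, a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}:
//
//   v_rotate_right<1>(a)      // {a1, a2, a3, 0}
//   v_rotate_right<1>(a, b)   // {a1, a2, a3, b0}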
2159 
2160 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8x16, u8, 16)
2161 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8x16, i8, 16)
2162 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16x8, u16, 8)
2163 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16x8, i16, 8)
2164 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32x4, u32, 4)
2165 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32x4, i32, 4)
2166 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64x2, u64, 2)
2167 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64x2, i64, 2)
2168 
2169 #define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
2170 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
2171 { \
2172  return _Tpvec(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
2173 } \
2174 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
2175 { \
2176  return _Tpvec(vslideup_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
2177 } \
2178 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
2179 { return a; } \
2180 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
2181 { \
2182  return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
2183 } \
2184 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
2185 { \
2186  return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
2187 } \
2188 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
2189 { CV_UNUSED(b); return a; }
2190 
2191 OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32x4, f32, 4)
2192 #if CV_SIMD128_64F
2193 OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64x2, f64, 2)
2194 #endif
2195 
2197 
2198 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2199 {
2200  return v_float32x4(vfcvt_f_x_v_f32m1(a, 4));
2201 }
2202 
2203 #if CV_SIMD128_64F
2204 #ifndef __clang__
2205 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2206 {
2207  double arr[4] = {a.val[0], a.val[1], 0, 0};
2208  vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2209  return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
2210 }
2211 
2212 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2213 {
2214  double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
2215  vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2216  return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
2217 }
2218 #else
2219 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2220 {
2221  vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
2222  return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
2223 }
2224 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2225 {
2226  vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
2227  return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(dst, 1, b), 4));
2228 }
2229 #endif
2230 
2231 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2232 {
2233  double ptr[4] = {0};
2234  vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
2235  double elems[2] =
2236  {
2237  ptr[0], ptr[1]
2238  };
2239  return v_float64x2(vle64_v_f64m1(elems, 2));
2240 }
2241 
2242 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2243 {
2244  double ptr[4] = {0};
2245  vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
2246  double elems[2] =
2247  {
2248  ptr[2], ptr[3]
2249  };
2250  return v_float64x2(vle64_v_f64m1(elems, 2));
2251 }
2252 
2253 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2254 {
2255  double ptr[4] = {0};
2256  vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
2257  double elems[2] =
2258  {
2259  ptr[0], ptr[1]
2260  };
2261  return v_float64x2(vle64_v_f64m1(elems, 2));
2262 }
2263 
2264 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2265 {
2266  double ptr[4] = {0};
2267  vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
2268  double elems[2] =
2269  {
2270  ptr[2], ptr[3]
2271  };
2272  return v_float64x2(vle64_v_f64m1(elems, 2));
2273 }
2274 
2275 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2276 {
2277  return v_float64x2(vfcvt_f_x_v_f64m1(a, 2));
2278 }
2279 #endif
2280 
2282 
2283 #define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
2284 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
2285 { \
2286  return v_setall_##suffix(v_extract_n<i>(v)); \
2287 }
2288 
2289 OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8)
2290 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
2291 OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
2292 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
2293 OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
2294 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
2295 OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
2296 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
2297 OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32)
2298 #if CV_SIMD128_64F
2299 OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64)
2300 #endif
2301 
2303 
2304 #define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
2305 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2306  const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2307  v_##_Tpvec& b0, v_##_Tpvec& b1, \
2308  v_##_Tpvec& b2, v_##_Tpvec& b3) \
2309 { \
2310  _Tp elems0[4] = \
2311  { \
2312  v_extract_n<0>(a0), \
2313  v_extract_n<0>(a1), \
2314  v_extract_n<0>(a2), \
2315  v_extract_n<0>(a3) \
2316  }; \
2317  b0 = v_load(elems0); \
2318  _Tp elems1[4] = \
2319  { \
2320  v_extract_n<1>(a0), \
2321  v_extract_n<1>(a1), \
2322  v_extract_n<1>(a2), \
2323  v_extract_n<1>(a3) \
2324  }; \
2325  b1 = v_load(elems1); \
2326  _Tp elems2[4] = \
2327  { \
2328  v_extract_n<2>(a0), \
2329  v_extract_n<2>(a1), \
2330  v_extract_n<2>(a2), \
2331  v_extract_n<2>(a3) \
2332  }; \
2333  b2 = v_load(elems2); \
2334  _Tp elems3[4] = \
2335  { \
2336  v_extract_n<3>(a0), \
2337  v_extract_n<3>(a1), \
2338  v_extract_n<3>(a2), \
2339  v_extract_n<3>(a3) \
2340  }; \
2341  b3 = v_load(elems3); \
2342 }
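// A scalar 4x4 transpose: every output row is rebuilt from one extracted element of each
// input row. Usage sketch (rows r0..r3 are hypothetical):
//
//   v_float32x4 c0, c1, c2, c3;
//   v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);   // ck[i] = ri[k]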
2343 
2344 OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
2345 OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
2346 OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
2347 
2348 
2350 #define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \
2351 inline _Tpvec v_reverse(const _Tpvec& a) \
2352 { \
2353  _Tp ptr[_Tpvec::nlanes] = {0}; \
2354  _Tp ptra[_Tpvec::nlanes] = {0}; \
2355  v_store(ptra, a); \
2356  for (int i = 0; i < _Tpvec::nlanes; i++) \
2357  { \
2358  ptr[i] = ptra[_Tpvec::nlanes-i-1]; \
2359  } \
2360  return v_load(ptr); \
2361 }
2362 
2363 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, u8)
2364 OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, i8)
2365 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, u16)
2366 OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, i16)
2367 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, u32)
2368 OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, i32)
2369 OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, f32)
2370 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, u64)
2371 OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, i64)
2372 #if CV_SIMD128_64F
2373 OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64)
2374 #endif
2375 
2377 
2378 #define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) \
2379 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
2380 { \
2381  _Tp lptr[_Tpvec::nlanes/2] = {0}; \
2382  _Tp hptr[_Tpvec::nlanes/2] = {0}; \
2383  v_store_low(lptr, a); \
2384  v_store_high(hptr, a); \
2385  b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
2386  b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
2387 } \
2388 inline _Tpwvec v_expand_low(const _Tpvec& a) \
2389 { \
2390  _Tp lptr[_Tpvec::nlanes/2] = {0}; \
2391  v_store_low(lptr, a); \
2392  return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
2393 } \
2394 inline _Tpwvec v_expand_high(const _Tpvec& a) \
2395 { \
2396  _Tp hptr[_Tpvec::nlanes/2] = {0}; \
2397  v_store_high(hptr, a); \
2398  return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
2399 } \
2400 inline _Tpwvec v_load_expand(const _Tp* ptr) \
2401 { \
2402  return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr, vl), vl)); \
2403 }
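// The expand helpers widen every lane to twice its width by spilling the requested half
// to a scratch buffer, reloading it as an mf2 (half-register) vector and applying a
// widening convert. Usage sketch (src is a hypothetical pointer):
//
//   v_uint8x16 a = v_load(src);   // 16 x u8
//   v_uint16x8 lo, hi;
//   v_expand(a, lo, hi);          // lo = widened a[0..7], hi = widened a[8..15]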
2404 
2405 OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1, 8)
2406 OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1, 8)
2407 OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1, 4)
2408 OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1, 4)
2409 OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1, 2)
2410 OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1, 2)
2411 
2412 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
2413 {
2414  return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr, 4), 4), 4));
2415 }
2416 
2417 inline v_int32x4 v_load_expand_q(const schar* ptr)
2418 {
2419  return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr, 4), 4), 4));
2420 }
2421 
2422 
2423 #define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \
2424 inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
2425 { \
2426  _wTp arr[_Tpvec::nlanes] = {0}; \
2427  v_store(arr, a); \
2428  v_store(arr + _wTpvec::nlanes, b); \
2429  return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); \
2430 } \
2431 inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
2432 { \
2433  _wTp arr[_Tpvec::nlanes] = {0}; \
2434  v_store(arr, a); \
2435  v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
2436  vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); \
2437 } \
2438 template<int n> inline \
2439 _Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
2440 { \
2441  _wTp arr[_Tpvec::nlanes] = {0}; \
2442  v_store(arr, a); \
2443  v_store(arr + _wTpvec::nlanes, b); \
2444  return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)); \
2445 } \
2446 template<int n> inline \
2447 void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
2448 { \
2449  _wTp arr[_Tpvec::nlanes] = {0}; \
2450  v_store(arr, a); \
2451  v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
2452  vse##hwidth##_v_##hsuffix##m1(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)), hvl); \
2453 }
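// The pack helpers narrow two wide vectors into one: both inputs are spilled to a scratch
// buffer, reloaded as an m2 (double-register) vector and narrowed. v_pack uses a shift of
// 0 (saturating vnclip for the 8/16-bit results, plain vnsrl/vnsra for the 64->32 case),
// while v_rshr_pack<n> performs the right shift by n as part of the same narrowing
// instruction. Usage sketch (lo and hi are hypothetical wide inputs):
//
//   v_int16x8 lo = ..., hi = ...;
//   v_int8x16 packed = v_pack(lo, hi);   // saturating 16 -> 8 bit narrowing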
2454 
2455 OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
2456 OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
2457 OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
2458 OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
2459 OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
2460 OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)
2461 
2462 
2463 #define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
2464 inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
2465 { \
2466  _wTp arr[_Tpvec::nlanes] = {0}; \
2467  v_store(arr, a); \
2468  v_store(arr + _wTpvec::nlanes, b); \
2469  return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \
2470 } \
2471 inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
2472 { \
2473  _wTp arr[_Tpvec::nlanes] = {0}; \
2474  v_store(arr, a); \
2475  v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
2476  vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); \
2477 } \
2478 template<int n> inline \
2479 _Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
2480 { \
2481  _wTp arr[_Tpvec::nlanes] = {0}; \
2482  v_store(arr, a); \
2483  v_store(arr + _wTpvec::nlanes, b); \
2484  return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl)); \
2485 } \
2486 template<int n> inline \
2487 void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
2488 { \
2489  _wTp arr[_Tpvec::nlanes] = {0}; \
2490  v_store(arr, a); \
2491  v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
2492  v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl))); \
2493 }
2494 
2495 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16)
2496 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8)
2497 
2498 
2499 #define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \
2500 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
2501 { \
2502  _Tp ptra0[v_##_Tpvec::nlanes] = {0}; \
2503  _Tp ptra1[v_##_Tpvec::nlanes] = {0}; \
2504  _Tp ptrb0[v_##_Tpvec::nlanes] = {0}; \
2505  _Tp ptrb1[v_##_Tpvec::nlanes] = {0}; \
2506  v_store(ptra0, a0); \
2507  v_store(ptra1, a1); \
2508  int i; \
2509  for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \
2510  { \
2511  ptrb0[i*2] = ptra0[i]; \
2512  ptrb0[i*2+1] = ptra1[i]; \
2513  } \
2514  for( ; i < v_##_Tpvec::nlanes; i++ ) \
2515  { \
2516  ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \
2517  ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \
2518  } \
2519  b0 = v_load(ptrb0); \
2520  b1 = v_load(ptrb1); \
2521 } \
2522 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2523 { \
2524  _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
2525  _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
2526  v_store_low(ptra, a); \
2527  v_store_low(ptrb, b); \
2528  return v_load_halves(ptra, ptrb); \
2529 } \
2530 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2531 { \
2532  _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
2533  _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
2534  v_store_high(ptra, a); \
2535  v_store_high(ptrb, b); \
2536  return v_load_halves(ptra, ptrb); \
2537 } \
2538 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
2539 { \
2540  c = v_combine_low(a, b); \
2541  d = v_combine_high(a, b); \
2542 }
2543 
2544 OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, u8)
2545 OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, i8)
2546 OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, u16)
2547 OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, i16)
2548 OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, u32)
2549 OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, i32)
2550 OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, f32)
2551 #if CV_SIMD128_64F
2552 OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64)
2553 #endif
2554 
2555 
2556 #define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) \
2557 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
2558 { \
2559  _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2560  _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2561  int i, i2; \
2562  for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2563  { \
2564  ptra[i] = ptr[i2]; \
2565  ptrb[i] = ptr[i2+1]; \
2566  } \
2567  a = v_load(ptra); \
2568  b = v_load(ptrb); \
2569 } \
2570 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
2571 { \
2572  _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2573  _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2574  _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
2575  int i, i3; \
2576  for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2577  { \
2578  ptra[i] = ptr[i3]; \
2579  ptrb[i] = ptr[i3+1]; \
2580  ptrc[i] = ptr[i3+2]; \
2581  } \
2582  a = v_load(ptra); \
2583  b = v_load(ptrb); \
2584  c = v_load(ptrc); \
2585 } \
2586 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
2587  v_##_Tpvec& c, v_##_Tpvec& d) \
2588 { \
2589  _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2590  _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2591  _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
2592  _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
2593  int i, i4; \
2594  for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2595  { \
2596  ptra[i] = ptr[i4]; \
2597  ptrb[i] = ptr[i4+1]; \
2598  ptrc[i] = ptr[i4+2]; \
2599  ptrd[i] = ptr[i4+3]; \
2600  } \
2601  a = v_load(ptra); \
2602  b = v_load(ptrb); \
2603  c = v_load(ptrc); \
2604  d = v_load(ptrd); \
2605 } \
2606 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2607  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2608 { \
2609  int i, i2; \
2610  _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2611  _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2612  v_store(ptra, a); \
2613  v_store(ptrb, b); \
2614  for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2615  { \
2616  ptr[i2] = ptra[i]; \
2617  ptr[i2+1] = ptrb[i]; \
2618  } \
2619 } \
2620 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2621  const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2622 { \
2623  int i, i3; \
2624  _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2625  _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2626  _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
2627  v_store(ptra, a); \
2628  v_store(ptrb, b); \
2629  v_store(ptrc, c); \
2630  for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2631  { \
2632  ptr[i3] = ptra[i]; \
2633  ptr[i3+1] = ptrb[i]; \
2634  ptr[i3+2] = ptrc[i]; \
2635  } \
2636 } \
2637 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2638  const v_##_Tpvec& c, const v_##_Tpvec& d, \
2639  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2640 { \
2641  int i, i4; \
2642  _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2643  _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2644  _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
2645  _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
2646  v_store(ptra, a); \
2647  v_store(ptrb, b); \
2648  v_store(ptrc, c); \
2649  v_store(ptrd, d); \
2650  for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2651  { \
2652  ptr[i4] = ptra[i]; \
2653  ptr[i4+1] = ptrb[i]; \
2654  ptr[i4+2] = ptrc[i]; \
2655  ptr[i4+3] = ptrd[i]; \
2656  } \
2657 } \
2658 inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
2659 { \
2660  _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
2661  _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
2662  v_store(ptrvec, vec); \
2663  for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
2664  { \
2665  ptr[4*i ] = ptrvec[4*i ]; \
2666  ptr[4*i+1] = ptrvec[4*i+2]; \
2667  ptr[4*i+2] = ptrvec[4*i+1]; \
2668  ptr[4*i+3] = ptrvec[4*i+3]; \
2669  } \
2670  return v_load(ptr); \
2671 } \
2672 inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
2673 { \
2674  _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
2675  _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
2676  v_store(ptrvec, vec); \
2677  for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
2678  { \
2679  ptr[8*i ] = ptrvec[8*i ]; \
2680  ptr[8*i+1] = ptrvec[8*i+4]; \
2681  ptr[8*i+2] = ptrvec[8*i+1]; \
2682  ptr[8*i+3] = ptrvec[8*i+5]; \
2683  ptr[8*i+4] = ptrvec[8*i+2]; \
2684  ptr[8*i+5] = ptrvec[8*i+6]; \
2685  ptr[8*i+6] = ptrvec[8*i+3]; \
2686  ptr[8*i+7] = ptrvec[8*i+7]; \
2687  } \
2688  return v_load(ptr); \
2689 }
2690 
2691 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar)
2692 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar)
2693 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort)
2694 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short)
2695 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned)
2696 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int)
2697 OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float)
2698 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64)
2699 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64)
2700 #if CV_SIMD128_64F
2701 OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double)
2702 #endif
2703 
2705 
2706 static const unsigned char popCountTable[] =
2707 {
2708  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
2709  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2710  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2711  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2712  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2713  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2714  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2715  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2716  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2717  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2718  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2719  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2720  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2721  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2722  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2723  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
2724 };
2725 
2726 #define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
2727 inline _rTpvec v_popcount(const _Tpvec& a) \
2728 { \
2729  uchar ptra[16] = {0}; \
2730  v_store(ptra, v_reinterpret_as_u8(a)); \
2731  _rTp ptr[_Tpvec::nlanes] = {0}; \
2732  v_store(ptr, v_setzero_##suffix()); \
2733  for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
2734  ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
2735  return v_load(ptr); \
2736 }
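// Population count is table-driven: the input is viewed as bytes, each byte is looked up
// in popCountTable, and the per-byte counts are accumulated back into the original
// element width. Usage sketch:
//
//   v_uint32x4 bits = v_popcount(v);   // bits[i] = number of set bits in v[i]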
2737 
2738 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8)
2739 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8)
2740 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16)
2741 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16)
2742 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32)
2743 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32)
2744 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64)
2745 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
2746 
2747 
2749 #ifndef __clang__
2750 #define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, vl, shift) \
2751 inline int v_signmask(const _Tpvec& a) \
2752 { \
2753  int mask = 0; \
2754  _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift, vl)); \
2755  for( int i = 0; i < _Tpvec::nlanes; i++ ) \
2756  mask |= (int)(tmp.val[i]) << i; \
2757  return mask; \
2758 }
2759 
2760 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 16, 7)
2761 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 8, 15)
2762 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 4, 31)
2763 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 2, 63)
2764 
2765 inline int v_signmask(const v_int8x16& a)
2766 { return v_signmask(v_reinterpret_as_u8(a)); }
2767 inline int v_signmask(const v_int16x8& a)
2768 { return v_signmask(v_reinterpret_as_u16(a)); }
2769 inline int v_signmask(const v_int32x4& a)
2770 { return v_signmask(v_reinterpret_as_u32(a)); }
2771 inline int v_signmask(const v_float32x4& a)
2772 { return v_signmask(v_reinterpret_as_u32(a)); }
2773 inline int v_signmask(const v_int64x2& a)
2774 { return v_signmask(v_reinterpret_as_u64(a)); }
2775 #if CV_SIMD128_64F
2776 inline int v_signmask(const v_float64x2& a)
2777 { return v_signmask(v_reinterpret_as_u64(a)); }
2778 #endif
2779 
2780 #else
2781 #define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, width, vl) \
2782 inline int v_signmask(const _Tpvec& a) \
2783 { \
2784  uint8_t ans[16] = {0};\
2785  vsm(ans, vmslt(a, 0, vl), vl);\
2786  return reinterpret_cast<int*>(ans)[0] & ((1 << (vl)) - 1);\
2787 }
2788 
2789 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8x16, 8, 16)
2790 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16x8, 16, 8)
2791 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32x4, 32, 4)
2792 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64x2, 64, 2)
2793 
2794 inline int v_signmask(const v_uint8x16& a)
2795 { return v_signmask(v_reinterpret_as_s8(a)); }
2796 inline int v_signmask(const v_uint16x8& a)
2797 { return v_signmask(v_reinterpret_as_s16(a)); }
2798 inline int v_signmask(const v_uint32x4& a)
2799 { return v_signmask(v_reinterpret_as_s32(a)); }
2800 inline int v_signmask(const v_float32x4& a)
2801 { return v_signmask(v_reinterpret_as_s32(a)); }
2802 inline int v_signmask(const v_uint64x2& a)
2803 { return v_signmask(v_reinterpret_as_s64(a)); }
2804 #if CV_SIMD128_64F
2805 inline int v_signmask(const v_float64x2& a)
2806 { return v_signmask(v_reinterpret_as_s64(a)); }
2807 #endif
2808 
2809 #endif
2810 
2812 
2813 #define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
2814 inline int v_scan_forward(const _Tpvec& a) \
2815 { \
2816  _Tp ptr[_Tpvec::nlanes] = {0}; \
2817  v_store(ptr, v_reinterpret_as_##suffix(a)); \
2818  for (int i = 0; i < _Tpvec::nlanes; i++) \
2819  if(int(ptr[i]) < 0) \
2820  return i; \
2821  return 0; \
2822 }
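// v_scan_forward returns the index of the first lane whose stored value is negative when
// read back as an int (i.e. the first lane of a signed comparison mask that is set), or 0
// when no lane qualifies. Usage sketch with signed lanes (a and b are hypothetical):
//
//   v_int32x4 m = a < b;              // all-ones (-1) where a[i] < b[i]
//   int idx = v_scan_forward(m);      // index of the first lane where a[i] < b[i]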
2823 
2824 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8)
2825 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8)
2826 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16)
2827 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16)
2828 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32)
2829 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32)
2830 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32)
2831 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64)
2832 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64)
2833 #if CV_SIMD128_64F
2834 OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
2835 #endif
2836 
2838 
2839 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2840 {
2841  const uint64 ptr[2] = {0x0908060504020100, 0xFFFFFF0F0E0D0C0A};
2842  const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
2843  return v_reinterpret_as_s8(v_uint8x16(
2844  vrgather_vv_u8m1(
2845  v_reinterpret_as_u8(vec),
2846  v_reinterpret_as_u8(flags),
2847  16)));
2848 }
2849 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
2850 {
2851  return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
2852 }
2853 
2854 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2855 {
2856  const uint64 ptr[2] = {0x0908050403020100, 0xFFFF0F0E0D0C0B0A};
2857  const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
2858  return v_reinterpret_as_s16(v_uint8x16(
2859  vrgather_vv_u8m1(
2860  v_reinterpret_as_u8(vec),
2861  v_reinterpret_as_u8(flags),
2862  16)));
2863 }
2864 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
2865 {
2866  return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
2867 }
2868 
2869 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2870 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2871 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2872 
2874 
2875 #if CV_FP16
2876 inline v_float32x4 v_load_expand(const hfloat* ptr)
2877 {
2878  return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4));
2879 }
2880 
2881 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2882 {
2883  vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
2884 }
2885 #else
2886 inline v_float32x4 v_load_expand(const hfloat* ptr)
2887 {
2888  const int N = 4;
2889  float buf[N];
2890  for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2891  return v_load(buf);
2892 }
2893 
2894 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2895 {
2896  const int N = 4;
2897  float buf[N];
2898  v_store(buf, v);
2899  for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
2900 }
2901 #endif
2902 
2904 
2905 inline v_int32x4 v_round(const v_float32x4& a)
2906 {
2907  return v_int32x4(vfcvt_x_f_v_i32m1(a, 4));
2908 }
2909 
2910 inline v_int32x4 v_floor(const v_float32x4& a)
2911 {
2912  v_float32x4 ZP5 = v_setall_f32(0.5f);
2913  v_float32x4 t = a - ZP5;
2914  return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
2915 }
2916 
2917 inline v_int32x4 v_ceil(const v_float32x4& a)
2918 {
2919  v_float32x4 ZP5 = v_setall_f32(0.5f);
2920  v_float32x4 t = a + ZP5;
2921  return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
2922 }
2923 
2924 inline v_int32x4 v_trunc(const v_float32x4& a)
2925 {
2926 #ifndef CV_RVV_THEAD_0_7
2927  return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4));
2928 #else
2929  const int old_round = fegetround(); fesetround(FE_TOWARDZERO);
2930  vint32m1_t val = vfcvt_x_f_v_i32m1(a, 4);
2931  fesetround(old_round);
2932  return v_int32x4(val);
2933 #endif
2934 }
2935 #if CV_SIMD128_64F
2936 #ifndef __clang__
2937 inline v_int32x4 v_round(const v_float64x2& a)
2938 {
2939  double arr[4] = {a.val[0], a.val[1], 0, 0};
2940  vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2941  return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
2942 }
2943 
2944 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2945 {
2946  double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
2947  vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2948  return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
2949 }
2950 
2951 inline v_int32x4 v_floor(const v_float64x2& a)
2952 {
2953  double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
2954  vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2955  return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
2956 }
2957 
2958 inline v_int32x4 v_ceil(const v_float64x2& a)
2959 {
2960  double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
2961  vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2962  return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
2963 }
2964 
2965 inline v_int32x4 v_trunc(const v_float64x2& a)
2966 {
2967  double arr[4] = {a.val[0], a.val[1], 0, 0};
2968  vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2969 #ifndef CV_RVV_THEAD_0_7
2970  return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4));
2971 #else
2972  const int old_round = fegetround(); fesetround(FE_TOWARDZERO);
2973  vint32m1_t val = vfncvt_x_f_w_i32m1(tmp, 4);
2974  fesetround(old_round);
2975  return v_int32x4(val);
2976 #endif
2977 }
2978 
2979 #else
2980 inline v_int32x4 v_round(const v_float64x2& a)
2981 {
2982  vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
2983  return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
2984 }
2985 
2986 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2987 {
2988  vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
2989  return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(dst, 1, b), 4));
2990 }
2991 
2992 inline v_int32x4 v_floor(const v_float64x2& a)
2993 {
2994  vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
2995  dst = vset_v_f64m1_f64m2(dst, 0, a);
2996  dst = vfsub_vf_f64m2(dst, 0.5, 2);
2997  return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
2998 }
2999 
3000 inline v_int32x4 v_ceil(const v_float64x2& a)
3001 {
3002  vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
3003  dst = vset_v_f64m1_f64m2(dst, 0, a);
3004  dst = vfadd_vf_f64m2(dst, 0.5, 2);
3005  return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
3006 }
3007 
3008 inline v_int32x4 v_trunc(const v_float64x2& a)
3009 {
3010  vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
3011  return v_int32x4(vfncvt_rtz_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
3012 }
3013 #endif
3014 #endif
3015 
3016 
3018 
3019 // 16 >> 32
3020 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
3021 {
3022  int ptr[8] = {0};
3023  v_int32x4 t1, t2;
3024  vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
3025  v_load_deinterleave(ptr, t1, t2);
3026  return t1 + t2;
3027 }
3028 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
3029 {
3030  int ptr[8] = {0};
3031  v_int32x4 t1, t2;
3032  vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
3033  v_load_deinterleave(ptr, t1, t2);
3034  return t1 + t2 + c;
3035 }
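// The dot-product pattern used here: vwmul produces all full-precision products,
// v_load_deinterleave splits them into even- and odd-indexed halves, and the two halves
// are added, so for 16-bit inputs lane i of the result is
//   a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]
// (plus c[i] in the accumulating overload).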
3036 
3037 // 32 >> 64
3038 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
3039 {
3040  int64 ptr[4] = {0};
3041  v_int64x2 t1, t2;
3042  vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
3043  v_load_deinterleave(ptr, t1, t2);
3044  return t1 + t2;
3045 }
3046 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
3047 {
3048  int64 ptr[4] = {0};
3049  v_int64x2 t1, t2;
3050  vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
3051  v_load_deinterleave(ptr, t1, t2);
3052  return t1 + t2 + c;
3053 }
3054 
3055 // 8 >> 32
3056 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
3057 {
3058  unsigned ptr[16] = {0};
3059  v_uint32x4 t1, t2, t3, t4;
3060  vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
3061  v_load_deinterleave(ptr, t1, t2, t3, t4);
3062  return t1 + t2 + t3 + t4;
3063 }
3064 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
3065  const v_uint32x4& c)
3066 {
3067  unsigned ptr[16] = {0};
3068  v_uint32x4 t1, t2, t3, t4;
3069  vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
3070  v_load_deinterleave(ptr, t1, t2, t3, t4);
3071  return t1 + t2 + t3 + t4 + c;
3072 }
3073 
3074 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
3075 {
3076  int ptr[16] = {0};
3077  v_int32x4 t1, t2, t3, t4;
3078  vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
3079  v_load_deinterleave(ptr, t1, t2, t3, t4);
3080  return t1 + t2 + t3 + t4;
3081 }
3082 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
3083  const v_int32x4& c)
3084 {
3085  int ptr[16] = {0};
3086  v_int32x4 t1, t2, t3, t4;
3087  vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
3088  v_load_deinterleave(ptr, t1, t2, t3, t4);
3089  return t1 + t2 + t3 + t4 + c;
3090 }
3091 
3092 // 16 >> 64
3093 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
3094 {
3095  uint64 ptr[8] = {0};
3096  v_uint64x2 t1, t2, t3, t4;
3097  vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
3098  v_load_deinterleave(ptr, t1, t2, t3, t4);
3099  return t1 + t2 + t3 + t4;
3100 }
3101 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
3102 {
3103  uint64 ptr[8] = {0};
3104  v_uint64x2 t1, t2, t3, t4;
3105  vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
3106  v_load_deinterleave(ptr, t1, t2, t3, t4);
3107  return t1 + t2 + t3 + t4 + c;
3108 }
3109 
3110 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
3111 {
3112  int64 ptr[8] = {0};
3113  v_int64x2 t1, t2, t3, t4;
3114  vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
3115  v_load_deinterleave(ptr, t1, t2, t3, t4);
3116  return t1 + t2 + t3 + t4;
3117 }
3118 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
3119  const v_int64x2& c)
3120 {
3121  int64 ptr[8] = {0};
3122  v_int64x2 t1, t2, t3, t4;
3123  vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
3124  v_load_deinterleave(ptr, t1, t2, t3, t4);
3125  return t1 + t2 + t3 + t4 + c;
3126 }
3127 
3128 // 32 >> 64f
3129 #if CV_SIMD128_64F
3130 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
3131 { return v_cvt_f64(v_dotprod(a, b)); }
3132 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
3133  const v_float64x2& c)
3134 { return v_dotprod_expand(a, b) + c; }
3135 #endif
3136 
3138 
3139 // 16 >> 32
3140 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
3141 {
3142  int ptr[8] = {0};
3143  vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
3144  v_int32x4 t1 = v_load(ptr);
3145  v_int32x4 t2 = v_load(ptr+4);
3146  return t1 + t2;
3147 }
3148 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
3149 {
3150  int ptr[8] = {0};
3151  vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
3152  v_int32x4 t1 = v_load(ptr);
3153  v_int32x4 t2 = v_load(ptr+4);
3154  return t1 + t2 + c;
3155 }
3156 
3157 // 32 >> 64
3158 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
3159 {
3160  int64 ptr[4] = {0};
3161  vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
3162  v_int64x2 t1 = v_load(ptr);
3163  v_int64x2 t2 = v_load(ptr+2);
3164  return t1 + t2;
3165 }
3166 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
3167 {
3168  int64 ptr[4] = {0};
3169  vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
3170  v_int64x2 t1 = v_load(ptr);
3171  v_int64x2 t2 = v_load(ptr+2);
3172  return t1 + t2 + c;
3173 }
3174 
3175 
3176 // 8 >> 32
3177 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
3178 {
3179  unsigned ptr[16] = {0};
3180  vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
3181  v_uint32x4 t1 = v_load(ptr);
3182  v_uint32x4 t2 = v_load(ptr+4);
3183  v_uint32x4 t3 = v_load(ptr+8);
3184  v_uint32x4 t4 = v_load(ptr+12);
3185  return t1 + t2 + t3 + t4;
3186 }
3187 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
3188 {
3189  unsigned ptr[16] = {0};
3190  vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
3191  v_uint32x4 t1 = v_load(ptr);
3192  v_uint32x4 t2 = v_load(ptr+4);
3193  v_uint32x4 t3 = v_load(ptr+8);
3194  v_uint32x4 t4 = v_load(ptr+12);
3195  return t1 + t2 + t3 + t4 + c;
3196 }
3197 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
3198 {
3199  int ptr[16] = {0};
3200  vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
3201  v_int32x4 t1 = v_load(ptr);
3202  v_int32x4 t2 = v_load(ptr+4);
3203  v_int32x4 t3 = v_load(ptr+8);
3204  v_int32x4 t4 = v_load(ptr+12);
3205  return t1 + t2 + t3 + t4;
3206 }
3207 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
3208 {
3209  int ptr[16] = {0};
3210  vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
3211  v_int32x4 t1 = v_load(ptr);
3212  v_int32x4 t2 = v_load(ptr+4);
3213  v_int32x4 t3 = v_load(ptr+8);
3214  v_int32x4 t4 = v_load(ptr+12);
3215  return t1 + t2 + t3 + t4 + c;
3216 }
3217 
3218 // 16 >> 64
3219 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
3220 {
3221  uint64 ptr[8] = {0};
3222  vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
3223  v_uint64x2 t1 = v_load(ptr);
3224  v_uint64x2 t2 = v_load(ptr+2);
3225  v_uint64x2 t3 = v_load(ptr+4);
3226  v_uint64x2 t4 = v_load(ptr+6);
3227  return t1 + t2 + t3 + t4;
3228 }
3229 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
3230 {
3231  uint64 ptr[8] = {0};
3232  vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
3233  v_uint64x2 t1 = v_load(ptr);
3234  v_uint64x2 t2 = v_load(ptr+2);
3235  v_uint64x2 t3 = v_load(ptr+4);
3236  v_uint64x2 t4 = v_load(ptr+6);
3237  return t1 + t2 + t3 + t4 + c;
3238 }
3239 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
3240 {
3241  int64 ptr[8] = {0};
3242  vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
3243  v_int64x2 t1 = v_load(ptr);
3244  v_int64x2 t2 = v_load(ptr+2);
3245  v_int64x2 t3 = v_load(ptr+4);
3246  v_int64x2 t4 = v_load(ptr+6);
3247  return t1 + t2 + t3 + t4;
3248 }
3249 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
3250 {
3251  int64 ptr[8] = {0};
3252  vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
3253  v_int64x2 t1 = v_load(ptr);
3254  v_int64x2 t2 = v_load(ptr+2);
3255  v_int64x2 t3 = v_load(ptr+4);
3256  v_int64x2 t4 = v_load(ptr+6);
3257  return t1 + t2 + t3 + t4 + c;
3258 }
3259 
3260 // 32 >> 64f
3261 #if CV_SIMD128_64F
3262 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
3263 { return v_cvt_f64(v_dotprod_fast(a, b)); }
3264 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
3265 { return v_dotprod_expand_fast(a, b) + c; }
3266 #endif
3267 
3268 
3269 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
3270  const v_float32x4& m1, const v_float32x4& m2,
3271  const v_float32x4& m3)
3272 {
3273  vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
3274  res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
3275  res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
3276  res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3, 4);
3277  return v_float32x4(res);
3278 }
3279 
3280 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
3281  const v_float32x4& m1, const v_float32x4& m2,
3282  const v_float32x4& a)
3283 {
3284  vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
3285  res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
3286  res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
3287  return v_float32x4(res) + a;
3288 }
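// v_matmul treats m0..m3 as the columns of a 4x4 matrix and v as the coefficients, i.e.
// the result is v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3; v_matmuladd drops the v[3]*m3
// term and adds the bias vector a instead. Usage sketch (all vectors are hypothetical):
//
//   v_float32x4 p = v_matmuladd(xyz1, col0, col1, col2, col3_bias);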
3289 
3290 #define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) \
3291 inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
3292 { \
3293  _Tpw ptr[_Tpwvec::nlanes*2] = {0}; \
3294  vse##width##_v_##suffix##m2(ptr, wmul(a, b, vl), vl); \
3295  c = _Tpwvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
3296  d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes, hvl)); \
3297 }
3298 
3299 OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16, 16, 8)
3300 OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16, 16, 8)
3301 OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32, 8, 4)
3302 OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32, 8, 4)
3303 OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64, 4, 2)
3304 
3305 
3306 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
3307 {
3308  return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b, 8), 16, 8));
3309 }
3310 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
3311 {
3312  return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b, 8), 16, 8));
3313 }
3314 
3315 
3317 
3318 #define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
3319 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
3320 { \
3321  _wTpvec c, d; \
3322  v_mul_expand(a, b, c, d); \
3323  return v_pack(c, d); \
3324 } \
3325 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
3326 { \
3327  a = a * b; \
3328  return a; \
3329 }
3330 
3331 OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
3332 OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
3333 OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
3334 OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)
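For 8- and 16-bit vectors, operator* is built from v_mul_expand followed by the saturating v_pack, so products that do not fit the element type clamp to the type's limits instead of wrapping. A tiny illustrative sketch:

#include <opencv2/core/hal/intrin.hpp>

// 100 * 3 = 300 does not fit in a uchar, so every lane saturates to 255.
inline void saturating_mul_demo(uchar out[16])
{
    using namespace cv;
    v_uint8x16 c = v_setall_u8((uchar)100) * v_setall_u8((uchar)3);
    v_store(out, c);
}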
3335 
3336 
3337 inline void v_cleanup() {}
3338 
3339 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3340 
3342 
3343 } // namespace cv
3344 
3345 #endif