EstervQrCode 1.1.1
Library for QR code manipulation
intrin_rvv_scalable.hpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// The original implementation is contributed by HAN Liutong.
// Copyright (C) 2022, Institute of Software, Chinese Academy of Sciences.

#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP

#include <opencv2/core/check.hpp>

// RVV intrinsics have been renamed in version 0.11, so we need to include
// compatibility headers:
// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>10999
#include "intrin_rvv_010_compat_non-policy.hpp"
#include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
#endif

#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
#include "intrin_rvv_011_compat.hpp"
#endif

#if defined(__GNUC__) && !defined(__clang__)
// FIXIT: eliminate massive warnings from templates
// GCC from 'rvv-next': riscv64-unknown-linux-gnu-g++ (g42df3464463) 12.0.1 20220505 (prerelease)
// doesn't work: #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

#ifndef CV_RVV_MAX_VLEN
#define CV_RVV_MAX_VLEN 1024
#endif

namespace cv
{


CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD_SCALABLE 1
#define CV_SIMD_SCALABLE_64F 1

using v_uint8 = vuint8m1_t;
using v_int8 = vint8m1_t;
using v_uint16 = vuint16m1_t;
using v_int16 = vint16m1_t;
using v_uint32 = vuint32m1_t;
using v_int32 = vint32m1_t;
using v_uint64 = vuint64m1_t;
using v_int64 = vint64m1_t;

using v_float32 = vfloat32m1_t;
#if CV_SIMD_SCALABLE_64F
using v_float64 = vfloat64m1_t;
#endif

using uchar = unsigned char;
using schar = signed char;
using ushort = unsigned short;
using uint = unsigned int;
using uint64 = unsigned long int;
using int64 = long int;

static const int __cv_rvv_e8m1_nlanes = vsetvlmax_e8m1();
static const int __cv_rvv_e16m1_nlanes = vsetvlmax_e16m1();
static const int __cv_rvv_e32m1_nlanes = vsetvlmax_e32m1();
static const int __cv_rvv_e64m1_nlanes = vsetvlmax_e64m1();
static const int __cv_rvv_e8m2_nlanes = vsetvlmax_e8m2();
static const int __cv_rvv_e16m2_nlanes = vsetvlmax_e16m2();
static const int __cv_rvv_e32m2_nlanes = vsetvlmax_e32m2();
static const int __cv_rvv_e64m2_nlanes = vsetvlmax_e64m2();
static const int __cv_rvv_e8m4_nlanes = vsetvlmax_e8m4();
static const int __cv_rvv_e16m4_nlanes = vsetvlmax_e16m4();
static const int __cv_rvv_e32m4_nlanes = vsetvlmax_e32m4();
static const int __cv_rvv_e64m4_nlanes = vsetvlmax_e64m4();
static const int __cv_rvv_e8m8_nlanes = vsetvlmax_e8m8();
static const int __cv_rvv_e16m8_nlanes = vsetvlmax_e16m8();
static const int __cv_rvv_e32m8_nlanes = vsetvlmax_e32m8();
static const int __cv_rvv_e64m8_nlanes = vsetvlmax_e64m8();

template <class T>
struct VTraits;

#define OPENCV_HAL_IMPL_RVV_TRAITS(REG, TYP, SUF, SZ) \
template <> \
struct VTraits<REG> \
{ \
    static inline int vlanes() { return __cv_rvv_##SUF##_nlanes; } \
    using lane_type = TYP; \
    static const int max_nlanes = CV_RVV_MAX_VLEN/SZ; \
};

OPENCV_HAL_IMPL_RVV_TRAITS(vint8m1_t, int8_t, e8m1, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vint8m2_t, int8_t, e8m2, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vint8m4_t, int8_t, e8m4, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vint8m8_t, int8_t, e8m8, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m1_t, uint8_t, e8m1, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m2_t, uint8_t, e8m2, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m4_t, uint8_t, e8m4, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m8_t, uint8_t, e8m8, 8)

OPENCV_HAL_IMPL_RVV_TRAITS(vint16m1_t, int16_t, e16m1, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vint16m2_t, int16_t, e16m2, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vint16m4_t, int16_t, e16m4, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vint16m8_t, int16_t, e16m8, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m1_t, uint16_t, e16m1, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m2_t, uint16_t, e16m2, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m4_t, uint16_t, e16m4, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m8_t, uint16_t, e16m8, 16)

OPENCV_HAL_IMPL_RVV_TRAITS(vint32m1_t, int32_t, e32m1, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vint32m2_t, int32_t, e32m2, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vint32m4_t, int32_t, e32m4, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vint32m8_t, int32_t, e32m8, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m1_t, uint32_t, e32m1, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m2_t, uint32_t, e32m2, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m4_t, uint32_t, e32m4, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m8_t, uint32_t, e32m8, 32)

OPENCV_HAL_IMPL_RVV_TRAITS(vint64m1_t, int64_t, e64m1, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vint64m2_t, int64_t, e64m2, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vint64m4_t, int64_t, e64m4, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vint64m8_t, int64_t, e64m8, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m1_t, uint64_t, e64m1, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m2_t, uint64_t, e64m2, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m4_t, uint64_t, e64m4, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m8_t, uint64_t, e64m8, 64)

OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m1_t, float, e32m1, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m2_t, float, e32m2, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m4_t, float, e32m4, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m8_t, float, e32m8, 32)

#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m1_t, double, e64m1, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m2_t, double, e64m2, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m4_t, double, e64m4, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m8_t, double, e64m8, 64)
#endif
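
// Usage sketch (illustrative only, not part of this header): with scalable
// vectors the lane count is a runtime value, so VTraits exposes vlanes() for
// loop steps and max_nlanes as a compile-time bound for stack buffers.
// The function name below is hypothetical.
#if 0
void vtraits_example()
{
    int lanes = VTraits<v_float32>::vlanes();           // actual lanes, fixed by the hardware VLEN at run time
    constexpr int cap = VTraits<v_float32>::max_nlanes; // static upper bound derived from CV_RVV_MAX_VLEN
    float buf[cap];                                     // always large enough to hold one vector
    (void)lanes; (void)buf;
}
#endif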


// LLVM/Clang defines "overloaded intrinsics" e.g. 'vand(op1, op2)'
// GCC does not have these functions, so we need to implement them manually
// We implement only selected subset required to build current state of the code
// Included inside namespace cv::
#ifndef __riscv_v_intrinsic_overloading
#include "intrin_rvv_compat_overloaded.hpp"
#endif // __riscv_v_intrinsic_overloading


#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \
inline _Tp v_get0(const v_##_Tpvec& v) \
{ \
    return vmv_x(v); \
}

OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)

inline float v_get0(const v_float32& v)
{
    return vfmv_f(v);
}
#if CV_SIMD_SCALABLE_64F
inline double v_get0(const v_float64& v)
{
    return vfmv_f(v);
}
#endif


#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
inline v_##_Tpvec v_setzero_##suffix1() \
{ \
    return vmv_v_x_##suffix2##m1(0, vl); \
} \
inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
{ \
    return vmv_v_x_##suffix2##m1(v, vl); \
}

OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits<v_int64>::vlanes())

#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
inline v_##_Tpv v_setzero_##suffix() \
{ \
    return vfmv_v_f_##suffix##m1(0, vl); \
} \
inline v_##_Tpv v_setall_##suffix(_Tp v) \
{ \
    return vfmv_v_f_##suffix##m1(v, vl); \
}

OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
#endif
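
// Usage sketch (illustrative): broadcasting a scalar and reading lane 0 back.
#if 0
void init_example()
{
    v_int32 zeros = v_setzero_s32();    // every lane = 0
    v_float32 twos = v_setall_f32(2.f); // every lane = 2.0f
    int z = v_get0(zeros);              // scalar from lane 0 (vmv_x)
    float t = v_get0(twos);             // scalar from lane 0 (vfmv_f)
    (void)z; (void)t;
}
#endif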

#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \
{ \
    return v;\
}
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
#endif
// TODO: can be simplified by using overloaded RV intrinsic
#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
}

OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64)
#endif
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64)


#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v));\
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v));\
}

OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
// Three times reinterpret
inline v_float32 v_reinterpret_as_f32(const v_float64& v)
{
    return vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v)));
}

inline v_float64 v_reinterpret_as_f64(const v_float32& v)
{
    return vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v)));
}
#endif


#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vl) \
template <int s = 0> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
{ \
    return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \
} \
template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
{ \
    return vmv_x(vslidedown(v_setzero_##suffix(), v, i, vl)); \
}


OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8, uchar, u8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8, schar, s8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16, ushort, u16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16, short, s16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32, unsigned int, u32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32, int, s32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64, uint64, u64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64, int64, s64, VTraits<v_int64>::vlanes())

#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vl) \
template <int s = 0> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
{ \
    return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \
} \
template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
{ \
    return vfmv_f(vslidedown(v_setzero_##suffix(), v, i, vl)); \
}

OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, f64, VTraits<v_float64>::vlanes())
#endif

#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) \
inline _Tp v_extract_highest(_Tpvec v) \
{ \
    return v_extract_n(v, vl-1); \
}

OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8, schar, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16, short, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits<v_int64>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits<v_float64>::vlanes())
#endif
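
// Usage sketch (illustrative): v_extract_n reads one lane, v_extract
// concatenates the tail of a with the head of b, and v_extract_highest
// reads the last lane.
#if 0
void extract_example(const v_int32& a, const v_int32& b)
{
    int lane2 = v_extract_n<2>(a);   // third lane of a
    v_int32 r = v_extract<1>(a, b);  // {a[1], ..., a[n-1], b[0]}
    int last = v_extract_highest(a); // lane vlanes()-1
    (void)lane2; (void)r; (void)last;
}
#endif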


#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
    return vle##width##_v_##suffix##m1(ptr, vl); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
    return vle##width##_v_##suffix##m1(ptr, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
    return vle##width##_v_##suffix##m1(ptr, hvl); \
} \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return vslideup(vle##width##_v_##suffix##m1(ptr0, hvl), vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, vl); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, vl); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, vl); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, hvl); \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
} \
template<typename... Targs> \
_Tpvec v_load_##suffix(Targs... nScalars) \
{ \
    return v_load({nScalars...}); \
}


OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8, vmv_v_x_u8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8, vmv_v_x_i8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits<v_uint16>::vlanes() / 2, VTraits<v_uint16>::vlanes(), 16, u16, vmv_v_x_u16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits<v_int16>::vlanes() / 2, VTraits<v_int16>::vlanes(), 16, i16, vmv_v_x_i16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_uint32>::vlanes() / 2, VTraits<v_uint32>::vlanes(), 32, u32, vmv_v_x_u32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32, vmv_v_x_i32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64, vmv_v_x_u64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64, vmv_v_x_i64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() / 2, VTraits<v_float32>::vlanes(), 32, f32, vfmv_v_f_f32m1)

#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_float64>::vlanes() / 2, VTraits<v_float64>::vlanes(), 64, f64, vfmv_v_f_f64m1)
#endif
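
// Usage sketch (illustrative): the canonical VLEN-agnostic loop built on the
// load/store wrappers above; the function name is hypothetical.
#if 0
void copy_example(const float* src, float* dst, int n)
{
    const int step = VTraits<v_float32>::vlanes();
    int i = 0;
    for (; i + step <= n; i += step)
        v_store(dst + i, v_load(src + i)); // one whole vector per iteration
    for (; i < n; ++i)
        dst[i] = src[i];                   // scalar tail
}
#endif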

#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
{ \
    auto vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
#endif

#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
{ \
    auto v0 = vle32_v_u32##suffix1((unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); \
    auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/2); \
    auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/2); \
    auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/2); \
    auto sh1 = vslide1up(v_trunc(vreinterpret_u32##suffix2(w1)), 0, VTraits<_Tpvec>::vlanes()); \
    auto vid = vor(sh1, v_trunc(vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); \
    auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, vlmul_trunc_u32mf2)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, vlmul_trunc_u32mf2)
#endif


#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
{ \
    auto v0 = vle32_v_u32##suffix0((unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); \
    auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/4); \
    auto v2 = vadd(v0, 2, VTraits<_Tpvec>::vlanes()/4); \
    auto v3 = vadd(v0, 3, VTraits<_Tpvec>::vlanes()/4); \
    auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/4); \
    auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/4); \
    auto w2 = vwcvtu_x(v2, VTraits<_Tpvec>::vlanes()/4); \
    auto w3 = vwcvtu_x(v3, VTraits<_Tpvec>::vlanes()/4); \
    auto sh2 = vslide1up(vreinterpret_u32##suffix1(w2), 0, VTraits<_Tpvec>::vlanes()/2); \
    auto sh3 = vslide1up(vreinterpret_u32##suffix1(w3), 0, VTraits<_Tpvec>::vlanes()/2); \
    auto vid0 = vor(sh2, vreinterpret_u32##suffix1(w0), VTraits<_Tpvec>::vlanes()/2); \
    auto vid1 = vor(sh3, vreinterpret_u32##suffix1(w1), VTraits<_Tpvec>::vlanes()/2); \
    auto wid0 = vwcvtu_x(v_trunc(vid0), VTraits<_Tpvec>::vlanes()/2); \
    auto wid1 = vwcvtu_x(v_trunc(vid1), VTraits<_Tpvec>::vlanes()/2); \
    auto shwid1 = vslide1up(vreinterpret_u32##suffix2(wid1), 0, VTraits<_Tpvec>::vlanes()); \
    auto vid = vor(shwid1, vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); \
    auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2, m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, vlmul_trunc_u32mf2)

#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \
inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
{ \
    v_uint32 vidx_ = vmul(vreinterpret_u32m1(vidx), sizeof(_Tp), VTraits<v_int32>::vlanes()); \
    return vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float)
OPENCV_HAL_IMPL_RVV_LUT_VEC(v_int32, int)
OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned)

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_lut(const double* tab, const v_int32& vidx)
{
    vuint32mf2_t vidx_ = vmul(vlmul_trunc_u32mf2(vreinterpret_u32m1(vidx)), sizeof(double), VTraits<v_float64>::vlanes());
    return vloxei32(tab, vidx_, VTraits<v_float64>::vlanes());
}
#endif


inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
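
// Usage sketch (illustrative): gathering table entries at arbitrary indices.
#if 0
void lut_example(const float* tab, const int* idx)
{
    v_float32 g  = v_lut(tab, idx);       // g[i] = tab[idx[i]]
    v_float32 gp = v_lut_pairs(tab, idx); // consecutive pairs: tab[idx[k]], tab[idx[k]+1]
    (void)g; (void)gp;
}
#endif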

inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b)
{
    return vnsrl(vset(vlmul_ext_v_u16m1_u16m2(a), 1, b), 0, VTraits<v_uint8>::vlanes());
}

inline v_uint8 v_pack_b(const v_uint32& a, const v_uint32& b,
                        const v_uint32& c, const v_uint32& d)
{
    return vnsrl(vnsrl(vset(vset(vset(vlmul_ext_u32m4(a), 1, b), 2, c), 3, d), 0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes());
}

inline v_uint8 v_pack_b(const v_uint64& a, const v_uint64& b, const v_uint64& c,
                        const v_uint64& d, const v_uint64& e, const v_uint64& f,
                        const v_uint64& g, const v_uint64& h)
{
    return vnsrl(vnsrl(vnsrl(
        vset(vset(vset(vset(vset(vset(vset(vlmul_ext_u64m8(a),
        1, b), 2, c), 3, d), 4, e), 5, f), 6, g), 7, h),
        0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes());
}
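
// Usage sketch (illustrative): v_pack_b keeps the low byte of every wider
// lane, e.g. to compact several comparison masks into one byte vector.
#if 0
void pack_b_example(const v_uint16& m0, const v_uint16& m1)
{
    v_uint8 packed = v_pack_b(m0, m1); // low byte of each 16-bit lane of m0 then m1
    (void)packed;
}
#endif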

#define OPENCV_HAL_IMPL_RVV_BIN_OP(_Tpvec, ocv_intrin, rvv_intrin) \
inline _Tpvec v_##ocv_intrin(const _Tpvec& a, const _Tpvec& b) \
{ \
    return rvv_intrin(a, b, VTraits<_Tpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub, vssubu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub, vssub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, vssubu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, vssub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, sub, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, mul, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, add, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, sub, vfsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, mul, vfmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, div, vfdiv)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, sub, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, sub, vsub)

#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, add, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, sub, vfsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, mul, vfmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, div, vfdiv)
#endif
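
// Usage sketch (illustrative): for 8- and 16-bit types v_add/v_sub saturate
// (vsadd/vssub above), while 32/64-bit ops wrap; the v_add_wrap family below
// provides modular arithmetic for the narrow types as well.
#if 0
void saturate_example()
{
    v_uint8 a = v_setall_u8(250), b = v_setall_u8(10);
    v_uint8 s = v_add(a, b); // every lane clamps to 255, not 260 mod 256
    (void)s;
}
#endif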

#define OPENCV_HAL_IMPL_RVV_BIN_MADD(_Tpvec, rvv_add) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
    return v_add(rvv_add(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \
}
#define OPENCV_HAL_IMPL_RVV_BIN_MMUL(_Tpvec, rvv_mul) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
    return v_mul(rvv_mul(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \
}
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint8, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int8, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint16, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int16, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint32, vadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int32, vadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float32, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint64, vadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, vadd)

OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, vfmul)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, vfmul)
#endif

#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _TpwvecM2, suffix, wmul) \
inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
{ \
    _TpwvecM2 temp = wmul(a, b, VTraits<_Tpvec>::vlanes()); \
    c = vget_##suffix##m1(temp, 0); \
    d = vget_##suffix##m1(temp, 1); \
}

OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8, v_uint16, vuint16m2_t, u16, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m2_t, i16, vwmul)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m2_t, u32, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m2_t, i32, vwmul)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m2_t, u64, vwmulu)

inline v_int16 v_mul_hi(const v_int16& a, const v_int16& b)
{
    return vmulh(a, b, VTraits<v_int16>::vlanes());
}
inline v_uint16 v_mul_hi(const v_uint16& a, const v_uint16& b)
{
    return vmulhu(a, b, VTraits<v_uint16>::vlanes());
}
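
// Usage sketch (illustrative): widening multiply keeps full precision by
// producing two double-width vectors; v_mul_hi keeps only the upper half.
#if 0
void mul_expand_example(const v_uint16& a, const v_uint16& b)
{
    v_uint32 lo, hi;
    v_mul_expand(a, b, lo, hi);  // 32-bit products of the low/high halves of the lanes
    v_uint16 h = v_mul_hi(a, b); // upper 16 bits of each 32-bit product
    (void)lo; (void)hi; (void)h;
}
#endif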

OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, mul_wrap, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, mul_wrap, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, mul_wrap, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, mul_wrap, vmul)

#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _clip, _wmul) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _clip(_wmul(a, b, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()); \
} \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& a1, const _Tpvec& a2, const Args&... va) { \
    return v_mul(_clip(_wmul(a1, a2, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()), va...); \
}

OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8, vnclipu, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8, vnclip, vwmul)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16, vnclipu, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16, vnclip, vwmul)


#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, vl) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vand(a, b, vl); \
} \
inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vor(a, b, vl); \
} \
inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vxor(a, b, vl); \
} \
inline _Tpvec v_not(const _Tpvec& a) \
{ \
    return vnot(a, vl); \
}

OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits<v_int64>::vlanes())

#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(intrin) \
inline v_float32 intrin(const v_float32& a, const v_float32& b) \
{ \
    return vreinterpret_f32m1(intrin(vreinterpret_i32m1(a), vreinterpret_i32m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_xor)

inline v_float32 v_not(const v_float32& a)
{
    return vreinterpret_f32m1(v_not(vreinterpret_i32m1(a)));
}

#if CV_SIMD_SCALABLE_64F
#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(intrin) \
inline v_float64 intrin(const v_float64& a, const v_float64& b) \
{ \
    return vreinterpret_f64m1(intrin(vreinterpret_i64m1(a), vreinterpret_i64m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_xor)

inline v_float64 v_not(const v_float64& a)
{
    return vreinterpret_f64m1(v_not(vreinterpret_i64m1(a)));
}
#endif


/* Usage
1. v_shl<N>(vec);
2. v_shl(vec, N); // instead of vec << N, when N is non-constant.
*/

#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \
template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
{ \
    return _Tpvec(vsll(a, uint8_t(n), vl)); \
} \
template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
{ \
    return _Tpvec(vsrl(a, uint8_t(n), vl)); \
}

#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \
template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
{ \
    return _Tpvec(vsll(a, uint8_t(n), vl)); \
} \
template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
{ \
    return _Tpvec(vsra(a, uint8_t(n), vl)); \
}

OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64, VTraits<v_int64>::vlanes())
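
// Usage sketch (illustrative): both call forms described in the comment above.
#if 0
void shift_example(const v_uint32& a, int n)
{
    v_uint32 c1 = v_shl<3>(a); // compile-time shift amount
    v_uint32 c2 = v_shr(a, n); // run-time shift amount
    (void)c1; (void)c2;
}
#endif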

#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix) \
inline _Tpvec v_##op(const _Tpvec& a, const _Tpvec& b) \
{ \
    size_t VLEN = VTraits<_Tpvec>::vlanes(); \
    uint64_t ones = -1; \
    return vmerge(intrin(a, b, VLEN), vmv_v_x_##suffix##m1(0, VLEN), ones, VLEN); \
}

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix) \
inline _Tpvec v_##op(const _Tpvec& a, const _Tpvec& b) \
{ \
    size_t VLEN = VTraits<_Tpvec>::vlanes(); \
    union { uint64_t u; VTraits<_Tpvec>::lane_type d; } ones; \
    ones.u = -1; \
    auto diff = intrin(a, b, VLEN); \
    auto z = vfmv_v_f_##suffix##m1(0, VLEN); \
    auto res = vfmerge(diff, z, ones.d, VLEN); \
    return _Tpvec(res); \
} //TODO

#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmsltu, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgtu, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsleu, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsgeu, suffix)

#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmslt, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgt, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsle, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsge, suffix)

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, eq, vmfeq, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ne, vmfne, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, lt, vmflt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, vmfgt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, vmfle, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, vmfge, suffix)


OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64, u64)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64)
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64)
#endif

inline v_float32 v_not_nan(const v_float32& a)
{ return v_eq(a, a); }

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_not_nan(const v_float64& a)
{ return v_eq(a, a); }
#endif
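
// Usage sketch (illustrative): comparisons return all-ones lanes for true and
// zero lanes for false, which composes with v_select further below;
// v_not_nan(a) is simply a == a, false only on NaN lanes.
#if 0
void compare_example(const v_float32& a, const v_float32& b)
{
    v_float32 mask = v_lt(a, b);    // lane-wise a < b
    v_float32 valid = v_not_nan(a); // all-ones where a is not NaN
    (void)mask; (void)valid;
}
#endif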


#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return intrin(a, b, vl); \
}

OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, vminu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, vmaxu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, vmin, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, vmax, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, vminu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, vmaxu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, vmin, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, vmax, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, vminu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, vmaxu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits<v_float64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits<v_float64>::vlanes())
#endif

#define OPENCV_HAL_IMPL_RVV_ZIP4(_Tpvec, _wTpvec, suffix, convert2u, convert) \
inline void v_zip4(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
    int vl = 4; \
    _wTpvec temp = vreinterpret_##suffix##m2(convert2u( \
        vor(vzext_vf2(convert(a0), vl), \
        vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(convert(a1), vl)), 0, vl*2)), \
        vl))); \
    b0 = vget_##suffix##m1(temp, 0); \
    b1 = vget_##suffix##m1(vrgather(temp, vadd(vid_v_u32m2(vl), 4, vl)/*{4,5,6,7} */, vl), 0); \
}

OPENCV_HAL_IMPL_RVV_ZIP4(v_uint32, vuint32m2_t, u32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP4(v_int32, vint32m2_t, i32, vreinterpret_u32m2, vreinterpret_u32m1)
OPENCV_HAL_IMPL_RVV_ZIP4(v_float32, vfloat32m2_t, f32, vreinterpret_u32m2, vreinterpret_u32m1)

#if 0
// These are v_zip4 and v_transpose4x4 for scalable VLEN; they cost more instructions than the current 128-bit-only version.
inline void v_zip4(const v_float32& a0, const v_float32& a1, v_float32& b0, v_float32& b1) {
    vuint64m1_t vid1 = vid_v_u64m1(VTraits<vuint64m1_t>::vlanes());
    vuint16m1_t t1 = vreinterpret_u16m1(vid1);
    vuint16m1_t t2 = vslide1up(t1, 0, VTraits<vuint16m1_t>::vlanes());
    vuint16m1_t t3 = vslide1up(t2, 0, VTraits<vuint16m1_t>::vlanes());
    vuint16m1_t t4 = vslide1up(t3, 0, VTraits<vuint16m1_t>::vlanes());
    t1 = vor(
        vor(t1, t2, VTraits<vuint16m1_t>::vlanes()),
        vor(t3, t4, VTraits<vuint16m1_t>::vlanes()),
        VTraits<vuint16m1_t>::vlanes()
    );
    vuint32m2_t vidx0 = vwmulu(t1, 4, VTraits<vuint32m1_t>::vlanes());
    vidx0 = vadd(vidx0, vid_v_u32m2(VTraits<vuint32m1_t>::vlanes()), VTraits<vuint32m1_t>::vlanes());
    vuint32m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint32m1_t>::vlanes());
    vfloat32m2_t temp = vreinterpret_f32m2(vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(a0), VTraits<vuint16m1_t>::vlanes()),
        vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a1), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vfloat32m1_t>::vlanes()*2)),
        VTraits<vfloat32m1_t>::vlanes())));
    b0 = vlmul_trunc_f32m1(vrgather(temp, vidx0, VTraits<vuint16m1_t>::vlanes()));
    b1 = vlmul_trunc_f32m1(vrgather(temp, vidx1, VTraits<vuint16m1_t>::vlanes()));
}

inline void v_transpose4x4(const v_float32& a0, const v_float32& a1, const v_float32& a2, const v_float32& a3,
                           v_float32& b0, v_float32& b1, v_float32& b2, v_float32& b3) {
    vuint64m2_t vid1 = vid_v_u64m2(VTraits<vuint32m1_t>::vlanes());
    vuint16m2_t t1 = vreinterpret_u16m2(vid1);
    vuint16m2_t t2 = vslide1up(t1, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t3 = vslide1up(t2, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t4 = vslide1up(t3, 0, VTraits<vuint8m1_t>::vlanes());
    t1 = vor(
        vor(t1, t2, VTraits<vuint8m1_t>::vlanes()),
        vor(t3, t4, VTraits<vuint8m1_t>::vlanes()),
        VTraits<vuint8m1_t>::vlanes()
    );
    vuint16m2_t vidx0 = vmul(t1, 12, VTraits<vuint8m1_t>::vlanes());
    vidx0 = vadd(vidx0, vid_v_u16m2(VTraits<vuint8m1_t>::vlanes()), VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx2 = vadd(vidx0, 8, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx3 = vadd(vidx0, 12, VTraits<vuint8m1_t>::vlanes());
    vuint32m2_t tempA = vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(a0), VTraits<vuint16m1_t>::vlanes()),
        vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a2), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
        VTraits<vuint32m1_t>::vlanes()));
    vuint32m2_t tempB = vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(a1), VTraits<vuint16m1_t>::vlanes()),
        vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a3), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
        VTraits<vuint32m1_t>::vlanes()));
    vfloat32m4_t temp = vreinterpret_f32m4(vreinterpret_u32m4(
        vor(vzext_vf2(tempA, VTraits<vuint8m1_t>::vlanes()),
        vreinterpret_u64m4(vslide1up(vreinterpret_u32m4(vzext_vf2(tempB, VTraits<vuint8m1_t>::vlanes())), 0, VTraits<vuint8m1_t>::vlanes())),
        VTraits<vuint16m1_t>::vlanes())));
    b0 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx0, VTraits<vuint8m1_t>::vlanes()));
    b1 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx1, VTraits<vuint8m1_t>::vlanes()));
    b2 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx2, VTraits<vuint8m1_t>::vlanes()));
    b3 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx3, VTraits<vuint8m1_t>::vlanes()));
}
#endif

#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, const _Tpvec& a2, const _Tpvec& a3, _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) { \
    _Tpvec t0,t1,t2,t3; \
    v_zip4(a0, a2, t0, t2); \
    v_zip4(a1, a3, t1, t3); \
    v_zip4(t0, t1, b0, b1); \
    v_zip4(t2, t3, b2, b3); \
}

OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_int32, i32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_float32, f32)
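
// Usage sketch (illustrative): transpose a 4x4 tile held in the low four
// lanes of four row vectors (v_zip4 above operates with vl = 4).
#if 0
void transpose_example(const v_float32& r0, const v_float32& r1,
                       const v_float32& r2, const v_float32& r3)
{
    v_float32 c0, c1, c2, c3;
    v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3); // columns of the input tile
    (void)c0; (void)c1; (void)c2; (void)c3;
}
#endif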


#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl, red) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
    _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \
    res = v##red(res, a, zero, vl); \
    return (scalartype)v_get0(res); \
}
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8, v_uint16, vuint16m1_t, unsigned, u16, VTraits<v_uint8>::vlanes(), wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8, v_int16, vint16m1_t, int, i16, VTraits<v_int8>::vlanes(), wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16, v_uint32, vuint32m1_t, unsigned, u32, VTraits<v_uint16>::vlanes(), wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16, v_int32, vint32m1_t, int, i32, VTraits<v_int16>::vlanes(), wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32, v_uint64, vuint64m1_t, unsigned, u64, VTraits<v_uint32>::vlanes(), wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32, v_int64, vint64m1_t, int, i64, VTraits<v_int32>::vlanes(), wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64, v_uint64, vuint64m1_t, uint64, u64, VTraits<v_uint64>::vlanes(), redsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64, v_int64, vint64m1_t, int64, i64, VTraits<v_int64>::vlanes(), redsum)


#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
    _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \
    res = vfredosum(res, a, zero, vl); \
    return (scalartype)v_get0(res); \
}
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, double, f64, VTraits<v_float64>::vlanes())
#endif

#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpvec res = _Tpvec(v##red(a, a, a, vl)); \
    return (scalartype)v_get0(res); \
}

OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, min, uchar, u8, VTraits<v_uint8>::vlanes(), redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, min, schar, i8, VTraits<v_int8>::vlanes(), redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, min, ushort, u16, VTraits<v_uint16>::vlanes(), redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, min, short, i16, VTraits<v_int16>::vlanes(), redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, min, unsigned, u32, VTraits<v_uint32>::vlanes(), redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, min, int, i32, VTraits<v_int32>::vlanes(), redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, min, float, f32, VTraits<v_float32>::vlanes(), fredmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, max, uchar, u8, VTraits<v_uint8>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, max, schar, i8, VTraits<v_int8>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, max, ushort, u16, VTraits<v_uint16>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits<v_int16>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits<v_uint32>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits<v_int32>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits<v_float32>::vlanes(), fredmax)
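
// Usage sketch (illustrative): horizontal reductions collapse a vector to one
// scalar; v_reduce_sum accumulates narrow types in a wider register
// (vwredsumu/vwredsum above) to limit overflow.
#if 0
void reduce_example(const v_uint8& a, const v_float32& f)
{
    unsigned s = v_reduce_sum(a); // sum of all byte lanes, 16-bit accumulator
    float m = v_reduce_max(f);    // largest lane
    (void)s; (void)m;
}
#endif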

inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b,
                               const v_float32& c, const v_float32& d)
{
    // 0000 1111 2222 3333 ....
    vuint64m2_t vid1 = vid_v_u64m2(VTraits<vuint32m1_t>::vlanes());
    vuint16m2_t t1 = vreinterpret_u16m2(vid1);
    vuint16m2_t t2 = vslide1up(t1, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t3 = vslide1up(t2, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t4 = vslide1up(t3, 0, VTraits<vuint8m1_t>::vlanes());
    t1 = vor(
        vor(t1, t2, VTraits<vuint8m1_t>::vlanes()),
        vor(t3, t4, VTraits<vuint8m1_t>::vlanes()),
        VTraits<vuint8m1_t>::vlanes()
    );

    // index for transpose4X4
    vuint16m2_t vidx0 = vmul(t1, 12, VTraits<vuint8m1_t>::vlanes());
    vidx0 = vadd(vidx0, vid_v_u16m2(VTraits<vuint8m1_t>::vlanes()), VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx2 = vadd(vidx0, 8, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx3 = vadd(vidx0, 12, VTraits<vuint8m1_t>::vlanes());

    // zip
    vuint32m2_t tempA = vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(a), VTraits<vuint16m1_t>::vlanes()),
        vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(c), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
        VTraits<vuint32m1_t>::vlanes()));
    vuint32m2_t tempB = vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(b), VTraits<vuint16m1_t>::vlanes()),
        vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(d), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
        VTraits<vuint32m1_t>::vlanes()));
    vfloat32m4_t temp = vreinterpret_f32m4(vreinterpret_u32m4(
        vor(vzext_vf2(tempA, VTraits<vuint8m1_t>::vlanes()),
        vreinterpret_u64m4(vslide1up(vreinterpret_u32m4(vzext_vf2(tempB, VTraits<vuint8m1_t>::vlanes())), 0, VTraits<vuint8m1_t>::vlanes())),
        VTraits<vuint16m1_t>::vlanes())));

    // transpose
    vfloat32m1_t b0 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx0, VTraits<vuint8m1_t>::vlanes()));
    vfloat32m1_t b1 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx1, VTraits<vuint8m1_t>::vlanes()));
    vfloat32m1_t b2 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx2, VTraits<vuint8m1_t>::vlanes()));
    vfloat32m1_t b3 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx3, VTraits<vuint8m1_t>::vlanes()));

    // vector add
    v_float32 res = vfadd(
        vfadd(b0, b1, VTraits<vfloat32m1_t>::vlanes()),
        vfadd(b2, b3, VTraits<vfloat32m1_t>::vlanes()),
        VTraits<vfloat32m1_t>::vlanes()
    );
    return res;
}


inline v_float32 v_sqrt(const v_float32& x)
{
    return vfsqrt(x, VTraits<v_float32>::vlanes());
}

inline v_float32 v_invsqrt(const v_float32& x)
{
    v_float32 one = v_setall_f32(1.0f);
    return v_div(one, v_sqrt(x));
}

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_sqrt(const v_float64& x)
{
    return vfsqrt(x, VTraits<v_float64>::vlanes());
}

inline v_float64 v_invsqrt(const v_float64& x)
{
    v_float64 one = v_setall_f64(1.0);
    return v_div(one, v_sqrt(x));
}
#endif

inline v_float32 v_magnitude(const v_float32& a, const v_float32& b)
{
    v_float32 x = vfmacc(vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes());
    return v_sqrt(x);
}

inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b)
{
    return v_float32(vfmacc(vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes()));
}

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_magnitude(const v_float64& a, const v_float64& b)
{
    v_float64 x = vfmacc(vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
    return v_sqrt(x);
}

inline v_float64 v_sqr_magnitude(const v_float64& a, const v_float64& b)
{
    return vfmacc(vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
}
#endif


inline v_float32 v_fma(const v_float32& a, const v_float32& b, const v_float32& c)
{
    return vfmacc(c, a, b, VTraits<v_float32>::vlanes());
}
inline v_int32 v_fma(const v_int32& a, const v_int32& b, const v_int32& c)
{
    return vmacc(c, a, b, VTraits<v_int32>::vlanes());
}

inline v_float32 v_muladd(const v_float32& a, const v_float32& b, const v_float32& c)
{
    return v_fma(a, b, c);
}

inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c)
{
    return v_fma(a, b, c);
}

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c)
{
    return vfmacc_vv_f64m1(c, a, b, VTraits<v_float64>::vlanes());
}

inline v_float64 v_muladd(const v_float64& a, const v_float64& b, const v_float64& c)
{
    return v_fma(a, b, c);
}
#endif
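
// Usage sketch (illustrative): fused multiply-add computes a*b + c with a
// single rounding (vfmacc); v_muladd is its portable alias.
#if 0
void fma_example(const v_float32& a, const v_float32& b, const v_float32& c)
{
    v_float32 r1 = v_fma(a, b, c);
    v_float32 r2 = v_muladd(a, b, c); // same result
    (void)r1; (void)r2;
}
#endif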


#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    return (int)vcpop(vmslt(a, 0, vl), vl) == vl; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    return (int)vcpop(vmslt(a, 0, vl), vl) != 0; \
}

OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64, VTraits<v_int64>::vlanes())


inline bool v_check_all(const v_uint8& a)
{ return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint8& a)
{ return v_check_any(v_reinterpret_as_s8(a)); }

inline bool v_check_all(const v_uint16& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint16& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }

inline bool v_check_all(const v_uint32& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_uint32& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }

inline bool v_check_all(const v_float32& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float32& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }

inline bool v_check_all(const v_uint64& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
inline bool v_check_any(const v_uint64& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }

#if CV_SIMD_SCALABLE_64F
inline bool v_check_all(const v_float64& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
inline bool v_check_any(const v_float64& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }
#endif
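
// Usage sketch (illustrative): the checks test the sign bit of every lane
// (vmslt against 0), so they pair naturally with comparison masks.
#if 0
bool any_negative_example(const v_int32& a)
{
    return v_check_any(v_lt(a, v_setzero_s32())); // true if some lane of a is < 0
}
#endif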


#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_sub(v_max(a, b), v_min(a, b)); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff)
#endif
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8, absdiffs)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16, absdiffs)

#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, width) \
inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vnclipu(vreinterpret_u##width##m2(vwsub_vv(v_max(a, b), v_min(a, b), VTraits<_Tpvec>::vlanes())), 0, VTraits<_Tpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8, v_uint8, 16)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16, v_uint16, 32)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32, v_uint32, 64)

#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
inline _Tprvec v_abs(const _Tpvec& a) \
{ \
    return v_absdiff(a, v_setzero_##suffix()); \
}

OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8)
OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16)
OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32)
OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64)
#endif


#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_reduce_sum(v_absdiff(a, b)); \
}

OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32, float)
1247 
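// Usage sketch (illustrative, not part of this header): v_reduce_sad is the
// sum of absolute differences, a common block-matching kernel. Assuming
// VLEN = 128 (4 float lanes):
//   float a[] = {1, 5, 2, 8}, b[] = {3, 1, 2, 10};
//   float sad = v_reduce_sad(v_load(a), v_load(b)); // 2 + 4 + 0 + 2 = 8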
1248 
1250 #define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, vl) \
1251 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1252 { \
1253  return vmerge(vmsne(mask, 0, vl), b, a, vl); \
1254 }
1255 
1256 OPENCV_HAL_IMPL_RVV_SELECT(v_uint8, VTraits<v_uint8>::vlanes())
1257 OPENCV_HAL_IMPL_RVV_SELECT(v_uint16, VTraits<v_uint16>::vlanes())
1258 OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits<v_uint32>::vlanes())
1259 OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits<v_int8>::vlanes())
1260 OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits<v_int16>::vlanes())
1261 OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits<v_int32>::vlanes())
1262 
1263 inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b)
1264 {
1265  return vmerge(vmfne(mask, 0, VTraits<v_float32>::vlanes()), b, a, VTraits<v_float32>::vlanes());
1266 }
1267 
1268 #if CV_SIMD_SCALABLE_64F
1269 inline v_float64 v_select(const v_float64& mask, const v_float64& a, const v_float64& b)
1270 {
1271  return vmerge(vmfne(mask, 0, VTraits<v_float64>::vlanes()), b, a, VTraits<v_float64>::vlanes());
1272 }
1273 #endif
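// Usage sketch (illustrative, not part of this header): v_select keeps
// a-lanes where the mask lane is non-zero and b-lanes elsewhere, e.g. a
// branch-free clamp of negatives to zero, given some v_int32 x:
//   v_int32 y = v_select(v_lt(x, v_setzero_s32()), v_setzero_s32(), x);
//   // y[i] = (x[i] < 0) ? 0 : x[i]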
1274 
1276 
1277 #define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
1278 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1279 { \
1280  return vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl); \
1281 } \
1282 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1283 { \
1284  return vslideup(vmv_v_x_##suffix##m1(0, vl), a, n, vl); \
1285 } \
1286 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1287 { return a; } \
1288 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1289 { \
1290  return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \
1291 } \
1292 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1293 { \
1294  return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \
1295 } \
1296 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1297 { CV_UNUSED(b); return a; }
1298 
1299 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8, u8, VTraits<v_uint8>::vlanes())
1300 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8, i8, VTraits<v_int8>::vlanes())
1301 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16, u16, VTraits<v_uint16>::vlanes())
1302 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16, i16, VTraits<v_int16>::vlanes())
1303 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32, u32, VTraits<v_uint32>::vlanes())
1304 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32, i32, VTraits<v_int32>::vlanes())
1305 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64, u64, VTraits<v_uint64>::vlanes())
1306 OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64, i64, VTraits<v_int64>::vlanes())
1307 
1308 #define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
1309 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1310 { \
1311  return vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); \
1312 } \
1313 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1314 { \
1315  return vslideup(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); \
1316 } \
1317 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1318 { return a; } \
1319 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1320 { \
1321  return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \
1322 } \
1323 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1324 { \
1325  return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \
1326 } \
1327 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1328 { CV_UNUSED(b); return a; }
1329 
1330 OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits<v_float32>::vlanes())
1331 #if CV_SIMD_SCALABLE_64F
1332 OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64, VTraits<v_float64>::vlanes())
1333 #endif
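// Usage sketch (illustrative, not part of this header): the one-argument
// rotates shift zeros in, the two-argument forms shift lanes in from a
// second vector. Assuming VLEN = 128, with a = {1 2 3 4}, b = {5 6 7 8}:
//   v_rotate_right<1>(a)    -> {2 3 4 0}
//   v_rotate_right<1>(a, b) -> {2 3 4 5}
//   v_rotate_left<1>(a)     -> {0 1 2 3}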
1334 
1336 inline v_float32 v_cvt_f32(const v_int32& a)
1337 {
1338  return vfcvt_f_x_v_f32m1(a, VTraits<v_float32>::vlanes());
1339 }
1340 
1341 #if CV_SIMD_SCALABLE_64F
1342 inline v_float32 v_cvt_f32(const v_float64& a)
1343 {
1344  return vfncvt_f(vlmul_ext_f64m2(a), VTraits<v_float64>::vlanes());
1345 }
1346 
1347 inline v_float32 v_cvt_f32(const v_float64& a, const v_float64& b)
1348 {
1349  return vfncvt_f(vset(vlmul_ext_f64m2(a),1,b), VTraits<v_float32>::vlanes());
1350 }
1351 
1352 inline v_float64 v_cvt_f64(const v_int32& a)
1353 {
1354  return vget_f64m1(vfwcvt_f(a, VTraits<v_int32>::vlanes()), 0);
1355 }
1356 
1357 inline v_float64 v_cvt_f64_high(const v_int32& a)
1358 {
1359  return vget_f64m1(vfwcvt_f(a, VTraits<v_int32>::vlanes()), 1);
1360 }
1361 
1362 inline v_float64 v_cvt_f64(const v_float32& a)
1363 {
1364  return vget_f64m1(vfwcvt_f(a, VTraits<v_float32>::vlanes()), 0);
1365 }
1366 
1367 inline v_float64 v_cvt_f64_high(const v_float32& a)
1368 {
1369  return vget_f64m1(vfwcvt_f(a, VTraits<v_float32>::vlanes()), 1);
1370 }
1371 
1372 inline v_float64 v_cvt_f64(const v_int64& a)
1373 {
1374  return vfcvt_f(a, VTraits<v_int64>::vlanes());
1375 }
1376 #endif
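// Usage sketch (illustrative, not part of this header): double lanes are
// half as many as int32 lanes, so the conversion is split in two. Assuming
// VLEN = 128 (4 x int32, 2 x double), with v = {1, 2, 3, 4}:
//   v_float64 lo = v_cvt_f64(v);      // {1.0, 2.0}
//   v_float64 hi = v_cvt_f64_high(v); // {3.0, 4.0}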
1377 
1379 
1380 #define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
1381 template<int s = 0> inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) \
1382 { \
1383  return v_setall_##suffix(v_extract_n(v, i)); \
1384 } \
1385 inline _Tpvec v_broadcast_highest(_Tpvec v) \
1386 { \
1387  return v_setall_##suffix(v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \
1388 }
1389 
1390 OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32)
1391 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32)
1392 OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32)
1393 
1394 
1395 #define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, width) \
1397 inline _Tpvec v_reverse(const _Tpvec& a) \
1398 { \
1399  vuint##width##m1_t vidx = vrsub(vid_v_u##width##m1(VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()-1, VTraits<_Tpvec>::vlanes()); \
1400  return vrgather(a, vidx, VTraits<_Tpvec>::vlanes()); \
1401 }
1402 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8)
1403 OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8)
1404 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16)
1405 OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16)
1406 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32)
1407 OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32)
1408 OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32)
1409 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64, 64)
1410 OPENCV_HAL_IMPL_RVV_REVERSE(v_int64, 64)
1411 #if CV_SIMD_SCALABLE_64F
1412 OPENCV_HAL_IMPL_RVV_REVERSE(v_float64, 64)
1413 #endif
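// Usage sketch (illustrative, not part of this header): v_reverse builds
// the index vector {vl-1, ..., 1, 0} with vid/vrsub and gathers through it,
// so {1 2 3 4} becomes {4 3 2 1}.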
1414 
1416 
1417 #define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \
1418 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1419 { \
1420  _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
1421  b0 = vget_##suffix##m1(temp, 0); \
1422  b1 = vget_##suffix##m1(temp, 1); \
1423 } \
1424 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1425 { \
1426  _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
1427  return vget_##suffix##m1(temp, 0); \
1428 } \
1429 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1430 { \
1431  _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
1432  return vget_##suffix##m1(temp, 1); \
1433 } \
1434 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1435 { \
1436  return cvt(vle##width##_v_##suffix2##mf2(ptr, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
1437 }
1438 
1439 OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x)
1440 OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, vwcvt_x)
1441 OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, vwcvtu_x)
1442 OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, vwcvt_x)
1443 OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, vwcvtu_x)
1444 OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, vwcvt_x)
1445 
1446 inline v_uint32 v_load_expand_q(const uchar* ptr)
1447 {
1448  return vwcvtu_x(vwcvtu_x(vle8_v_u8mf4(ptr, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
1449 }
1450 
1451 inline v_int32 v_load_expand_q(const schar* ptr)
1452 {
1453  return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
1454 }
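// Usage sketch (illustrative, not part of this header): v_expand widens
// each lane to the next integer width. Assuming VLEN = 128 (16 u8 lanes):
//   uchar buf[16] = { /* ... */ };
//   v_uint16 lo, hi;
//   v_expand(v_load(buf), lo, hi);     // lo = buf[0..7], hi = buf[8..15], widened
//   v_uint16 w = v_load_expand(buf);   // widen the first 8 bytes directly
//   v_uint32 q = v_load_expand_q(buf); // widen the first 4 bytes to u32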
1455 
1456 #define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, hwidth, hsuffix, suffix, rshr, shr) \
1457 inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
1458 { \
1459  return shr(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, VTraits<_Tpvec>::vlanes()); \
1460 } \
1461 inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
1462 { \
1463  vse##hwidth##_v_##hsuffix##mf2(ptr, shr(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \
1464 } \
1465 template<int n = 0> inline \
1466 _Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b, int N = n) \
1467 { \
1468  return rshr(vset(vlmul_ext_##suffix##m2(a), 1, b), N, VTraits<_Tpvec>::vlanes()); \
1469 } \
1470 template<int n = 0> inline \
1471 void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a, int N = n) \
1472 { \
1473  vse##hwidth##_v_##hsuffix##mf2(ptr, rshr(a, N, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \
1474 }
1475 
1476 OPENCV_HAL_IMPL_RVV_PACK(v_uint8, uchar, v_uint16, 8, u8, u16, vnclipu, vnclipu)
1477 OPENCV_HAL_IMPL_RVV_PACK(v_int8, schar, v_int16, 8, i8, i16, vnclip, vnclip)
1478 OPENCV_HAL_IMPL_RVV_PACK(v_uint16, ushort, v_uint32, 16, u16, u32, vnclipu, vnclipu)
1479 OPENCV_HAL_IMPL_RVV_PACK(v_int16, short, v_int32, 16, i16, i32, vnclip, vnclip)
1480 OPENCV_HAL_IMPL_RVV_PACK(v_uint32, unsigned, v_uint64, 32, u32, u64, vnclipu, vnsrl)
1481 OPENCV_HAL_IMPL_RVV_PACK(v_int32, int, v_int64, 32, i32, i64, vnclip, vnsra)
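// Usage sketch (illustrative, not part of this header): v_pack narrows two
// wide vectors into one, saturating in the 8/16-bit cases (vnclip/vnclipu)
// and truncating for 64 -> 32 (vnsrl/vnsra); v_rshr_pack applies a rounding
// right shift by n first. Assuming VLEN = 128 (2 x 4 int32 -> 8 int16 lanes):
//   v_int16 p = v_pack(a, b);         // saturate each int32 lane to int16
//   v_int16 q = v_rshr_pack<4>(a, b); // round-shift right by 4, then saturate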
1482 
1483 #define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
1484 inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
1485 { \
1486  return vnclipu(cast(vmax(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), 0, vl); \
1487 } \
1488 inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
1489 { \
1490  vse##hwidth##_v_##hsuffix##mf2(ptr, vnclipu(vreinterpret_u##width##m1(vmax(a, 0, vl)), 0, vl), hvl); \
1491 } \
1492 template<int N = 0> inline \
1493 _Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b, int n = N) \
1494 { \
1495  return vnclipu(cast(vmax(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), n, vl); \
1496 } \
1497 template<int N = 0> inline \
1498 void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \
1499 { \
1500  vse##hwidth##_v_##hsuffix##mf2(ptr, vnclipu(vreinterpret_u##width##m1(vmax(a, 0, vl)), n, vl), hvl); \
1501 }
1502 
1503 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, VTraits<v_int16>::vlanes(), VTraits<v_uint8>::vlanes())
1504 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, VTraits<v_int32>::vlanes(), VTraits<v_uint16>::vlanes())
1505 
1506 
1507 /* void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
1508  a0 = {A1 A2 A3 A4}
1509  a1 = {B1 B2 B3 B4}
1510 ---------------
1511  b0 = {A1 B1 A2 B2}, b1 = {A3 B3 A4 B4}
1512 */
1513 
1514 #define OPENCV_HAL_IMPL_RVV_ZIP(_Tpvec, _wTpvec, suffix, width, width2, convert2um2, convert2um1) \
1515 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
1516  _wTpvec temp = vreinterpret_##suffix##m2(convert2um2( \
1517  vor(vzext_vf2(convert2um1(a0), VTraits<_Tpvec>::vlanes()*2), \
1518  vreinterpret_u##width2##m2(vslide1up(vreinterpret_u##width##m2(vzext_vf2(convert2um1(a1), VTraits<_Tpvec>::vlanes()*2)), 0, VTraits<_Tpvec>::vlanes()*2)), \
1519  VTraits<_Tpvec>::vlanes()))); \
1520  b0 = vget_##suffix##m1(temp, 0); \
1521  b1 = vget_##suffix##m1(temp, 1); \
1522 }
1523 OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1524 OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, vreinterpret_u8m2, vreinterpret_u8m1)
1525 OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1526 OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, vreinterpret_u16m2, vreinterpret_u16m1)
1527 OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1528 OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
1529 OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
1530 
1531 #if CV_SIMD_SCALABLE_64F
1532 inline void v_zip(const v_float64& a0, const v_float64& a1, v_float64& b0, v_float64& b1) {
1533  vuint16mf4_t idx0 = vid_v_u16mf4(VTraits<v_float64>::vlanes());
1534  vuint16mf4_t idx1 = vadd(idx0, VTraits<v_float64>::vlanes(), VTraits<v_float64>::vlanes());
1535  vuint16mf2_t idx = vreinterpret_u16mf2((
1536  vor(vzext_vf2(idx0, VTraits<v_float64>::vlanes()),
1537  vreinterpret_u32mf2(vslide1up(vreinterpret_u16mf2(vzext_vf2(idx1, VTraits<v_float64>::vlanes())), 0, VTraits<v_uint32>::vlanes())),
1538  VTraits<v_uint32>::vlanes())));
1539 #if 0
1540  vfloat64m2_t temp = __riscv_vcreate_v_f64m1_f64m2(a0, a1);
1541 #else // TODO: clean up when the RVV intrinsics are frozen.
1542  vfloat64m2_t temp = vlmul_ext_f64m2(a0);
1543  temp = vset(temp, 1, a1);
1544 #endif
1545  temp = vrgatherei16(temp, idx, VTraits<v_float64>::vlanes()*2);
1546  b0 = vget_f64m1(temp, 0);
1547  b1 = vget_f64m1(temp, 1);
1548 }
1549 #endif
1550 
1551 #define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \
1552 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1553 { \
1554  return vslideup(a, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes());\
1555 } \
1556 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1557 { \
1558  return vslideup( \
1559  vslidedown(a, a, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
1560  vslidedown(b, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
1561  VTraits<_Tpvec>::vlanes()/2, \
1562  VTraits<_Tpvec>::vlanes()); \
1563 } \
1564 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1565 { \
1566  c = v_combine_low(a, b); \
1567  d = v_combine_high(a, b); \
1568 }
1569 
1570 OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint8, 8)
1571 OPENCV_HAL_IMPL_RVV_UNPACKS(v_int8, 8)
1572 OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16)
1573 OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16)
1574 OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32)
1575 OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32)
1576 OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32)
1577 #if CV_SIMD_SCALABLE_64F
1578 OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64)
1579 #endif
1580 
1581 #define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width, hwidth, vl) \
1582 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
1583 { \
1584  a = vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
1585  b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
1586 }\
1587 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
1588 { \
1589  a = vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
1590  b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
1591  c = vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
1592 } \
1593 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
1594  v_##_Tpvec& c, v_##_Tpvec& d) \
1595 { \
1596  \
1597  a = vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
1598  b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
1599  c = vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
1600  d = vlse##width##_v_##suffix##m1(ptr+3, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
1601 } \
1602 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1603  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1604 { \
1605  vsse##width(ptr, sizeof(_Tp)*2, a, VTraits<v_##_Tpvec>::vlanes()); \
1606  vsse##width(ptr+1, sizeof(_Tp)*2, b, VTraits<v_##_Tpvec>::vlanes()); \
1607 } \
1608 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1609  const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1610 { \
1611  vsse##width(ptr, sizeof(_Tp)*3, a, VTraits<v_##_Tpvec>::vlanes()); \
1612  vsse##width(ptr+1, sizeof(_Tp)*3, b, VTraits<v_##_Tpvec>::vlanes()); \
1613  vsse##width(ptr+2, sizeof(_Tp)*3, c, VTraits<v_##_Tpvec>::vlanes()); \
1614 } \
1615 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1616  const v_##_Tpvec& c, const v_##_Tpvec& d, \
1617  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
1618 { \
1619  vsse##width(ptr, sizeof(_Tp)*4, a, VTraits<v_##_Tpvec>::vlanes()); \
1620  vsse##width(ptr+1, sizeof(_Tp)*4, b, VTraits<v_##_Tpvec>::vlanes()); \
1621  vsse##width(ptr+2, sizeof(_Tp)*4, c, VTraits<v_##_Tpvec>::vlanes()); \
1622  vsse##width(ptr+3, sizeof(_Tp)*4, d, VTraits<v_##_Tpvec>::vlanes()); \
1623 }
1624 
1625 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, u8, 8, 4, VTraits<v_uint8>::vlanes())
1626 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, i8, 8, 4, VTraits<v_int8>::vlanes())
1627 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, u16, 16, 8, VTraits<v_uint16>::vlanes())
1628 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, i16, 16, 8, VTraits<v_int16>::vlanes())
1629 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, u32, 32, 16, VTraits<v_uint32>::vlanes())
1630 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, i32, 32, 16, VTraits<v_int32>::vlanes())
1631 OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, f32, 32, 16, VTraits<v_float32>::vlanes())
1632 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, u64, 64, 32, VTraits<v_uint64>::vlanes())
1633 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, i64, 64, 32, VTraits<v_int64>::vlanes())
1634 #if CV_SIMD_SCALABLE_64F
1635 OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, f64, 64, 32, VTraits<v_float64>::vlanes())
1636 #endif
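// Usage sketch (illustrative, not part of this header): the
// (de)interleave pairs convert between packed channel order (e.g.
// BGRBGR...) and one vector per channel, using strided loads/stores.
// `bgr` and `out` are hypothetical uchar buffers:
//   v_uint8 b, g, r;
//   v_load_deinterleave(bgr, b, g, r); // b = {bgr[0], bgr[3], bgr[6], ...}
//   v_store_interleave(out, b, g, r);  // writes the channels back packed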
1637 
1638 static uint64_t idx_interleave_pairs[] = {
1639  0x0705060403010200, 0x0f0d0e0c0b090a08, 0x1715161413111210, 0x1f1d1e1c1b191a18,
1640  0x2725262423212220, 0x2f2d2e2c2b292a28, 0x3735363433313230, 0x3f3d3e3c3b393a38,
1641  0x4745464443414240, 0x4f4d4e4c4b494a48, 0x5755565453515250, 0x5f5d5e5c5b595a58,
1642  0x6765666463616260, 0x6f6d6e6c6b696a68, 0x7775767473717270, 0x7f7d7e7c7b797a78};
1643 
1644 static uint64_t idx_interleave_quads[] = {
1645  0x0703060205010400, 0x0f0b0e0a0d090c08, 0x1713161215111410, 0x1f1b1e1a1d191c18,
1646  0x2723262225212420, 0x2f2b2e2a2d292c28, 0x3733363235313430, 0x3f3b3e3a3d393c38,
1647  0x4743464245414440, 0x4f4b4e4a4d494c48, 0x5753565255515450, 0x5f5b5e5a5d595c58,
1648  0x6763666265616460, 0x6f6b6e6a6d696c68, 0x7773767275717470, 0x7f7b7e7a7d797c78};
1649 
1650 #define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(_Tpvec, func) \
1651 inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
1652  CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
1653  vuint8m1_t vidx = vundefined_u8m1();\
1654  vidx = vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)); \
1655  return vrgather(vec, vidx, VTraits<v_uint8>::vlanes()); \
1656 }
1657 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, pairs)
1658 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, pairs)
1659 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, quads)
1660 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, quads)
1661 
1662 #define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(_Tpvec, width, vzext_vfx, func) \
1663 inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
1664  CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
1665  vuint##width##m1_t vidx = vundefined_u##width##m1();\
1666  vidx = vget_u##width##m1(vzext_vfx(vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)), VTraits<v_uint8>::vlanes()), 0); \
1667  return vrgather(vec, vidx, VTraits<_Tpvec>::vlanes()); \
1668 }
1669 
1670 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, pairs)
1671 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, pairs)
1672 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, pairs)
1673 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, pairs)
1674 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, pairs)
1675 
1676 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, quads)
1677 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, quads)
1678 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, quads)
1679 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, quads)
1680 OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, quads)
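// Usage sketch (illustrative, not part of this header): within each group
// of four lanes, v_interleave_pairs maps {a0 a1 a2 a3} -> {a0 a2 a1 a3};
// within each group of eight, v_interleave_quads maps {a0 .. a7} ->
// {a0 a4 a1 a5 a2 a6 a3 a7}. The tables above encode these permutations as
// byte indices consumed by vrgather.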
1681 
1682 static const unsigned char popCountTable[256] =
1684 {
1685  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1686  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1687  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1688  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1689  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1690  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1691  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1692  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1693  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1694  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1695  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1696  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1697  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1698  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1699  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1700  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
1701 };
1702 #define OPENCV_HAL_IMPL_RVV_HADD(_Tpvec, _Tpvec2, _Tm2, width, width2, suffix, add) \
1703 static inline _Tpvec2 v_hadd(_Tpvec a) { \
1704  vuint##width2##m1_t oneX2 = vmv_v_x_u##width2##m1(1, VTraits<v_uint##width2>::vlanes()); \
1705  vuint##width##m1_t one = vreinterpret_u##width##m1(oneX2); \
1706  _Tm2 res = add(a, vslide1down(a, 0, VTraits<v_uint##width>::vlanes()), VTraits<v_uint##width>::vlanes()); \
1707  return vget_##suffix##m1(vcompress(vmseq(one, 1, VTraits<v_uint##width>::vlanes()), res, res, VTraits<v_uint##width>::vlanes()), 0); \
1708 }
1709 OPENCV_HAL_IMPL_RVV_HADD(v_uint8, v_uint16, vuint16m2_t, 8, 16, u16, vwaddu_vv)
1710 OPENCV_HAL_IMPL_RVV_HADD(v_uint16, v_uint32, vuint32m2_t, 16, 32, u32, vwaddu_vv)
1711 OPENCV_HAL_IMPL_RVV_HADD(v_uint32, v_uint64, vuint64m2_t, 32, 64, u64, vwaddu_vv)
1712 OPENCV_HAL_IMPL_RVV_HADD(v_int8, v_int16, vint16m2_t, 8, 16, i16, vwadd_vv)
1713 OPENCV_HAL_IMPL_RVV_HADD(v_int16, v_int32, vint32m2_t, 16, 32, i32, vwadd_vv)
1714 OPENCV_HAL_IMPL_RVV_HADD(v_int32, v_int64, vint64m2_t, 32, 64, i64, vwadd_vv)
1715 
1716 OPENCV_HAL_IMPL_RVV_HADD(vint32m2_t, v_int32, vint32m2_t, 16, 32, i32, vadd)
1717 OPENCV_HAL_IMPL_RVV_HADD(vint64m2_t, v_int64, vint64m2_t, 32, 64, i64, vadd)
1718 
1719 inline v_uint8 v_popcount(const v_uint8& a)
1720 {
1721  return vloxei8(popCountTable, a, VTraits<v_uint8>::vlanes());
1722 }
1723 inline v_uint16 v_popcount(const v_uint16& a)
1724 {
1725  return v_hadd(v_popcount(vreinterpret_u8m1(a)));
1726 }
1727 inline v_uint32 v_popcount(const v_uint32& a)
1728 {
1729  return v_hadd(v_hadd(v_popcount(vreinterpret_u8m1(a))));
1730 }
1731 inline v_uint64 v_popcount(const v_uint64& a)
1732 {
1733  return v_hadd(v_hadd(v_hadd(v_popcount(vreinterpret_u8m1(a)))));
1734 }
1735 
1736 inline v_uint8 v_popcount(const v_int8& a)
1737 {
1738  return v_popcount(v_abs(a));
1739 }
1740 inline v_uint16 v_popcount(const v_int16& a)
1741 {
1742  return v_popcount(v_abs(a));
1743 }
1744 inline v_uint32 v_popcount(const v_int32& a)
1745 {
1746  return v_popcount(v_abs(a));
1747 }
1748 inline v_uint64 v_popcount(const v_int64& a)
1749 {
1750  // max(a, 0 - a) is used, since v_abs does not support 64-bit integers.
1751  return v_popcount(v_reinterpret_as_u64(vmax(a, v_sub(v_setzero_s64(), a), VTraits<v_int64>::vlanes())));
1752 }
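// Usage sketch (illustrative, not part of this header): v_popcount counts
// set bits per lane via the byte table above; v_hadd then folds adjacent
// byte counts for the wider types. Assuming VLEN = 128 (4 u32 lanes):
//   unsigned x[] = {0xF, 0xFF, 0x0, 0x80000001};
//   v_uint32 c = v_popcount(v_load(x)); // lanes: {4, 8, 0, 2}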
1753 
1754 
1756 #define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec) \
1757 inline int v_signmask(const _Tpvec& a) \
1758 { \
1759  uint8_t ans[4] = {0}; \
1760  vsm(ans, vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
1761  return *(reinterpret_cast<int*>(ans)) & (((__int128_t)1 << VTraits<_Tpvec>::vlanes()) - 1); \
1762 } \
1763 inline int v_scan_forward(const _Tpvec& a) \
1764 { \
1765  return (int)vfirst(vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
1766 }
1767 
1768 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8)
1769 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16)
1770 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32)
1771 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64)
1772 
1773 inline int64 v_signmask(const v_uint8& a)
1774 { return v_signmask(v_reinterpret_as_s8(a)); }
1775 inline int64 v_signmask(const v_uint16& a)
1776 { return v_signmask(v_reinterpret_as_s16(a)); }
1777 inline int v_signmask(const v_uint32& a)
1778 { return v_signmask(v_reinterpret_as_s32(a)); }
1779 inline int v_signmask(const v_float32& a)
1780 { return v_signmask(v_reinterpret_as_s32(a)); }
1781 inline int v_signmask(const v_uint64& a)
1782 { return v_signmask(v_reinterpret_as_s64(a)); }
1783 #if CV_SIMD_SCALABLE_64F
1784 inline int v_signmask(const v_float64& a)
1785 { return v_signmask(v_reinterpret_as_s64(a)); }
1786 #endif
1787 
1789 inline int v_scan_forward(const v_uint8& a)
1790 { return v_scan_forward(v_reinterpret_as_s8(a)); }
1791 inline int v_scan_forward(const v_uint16& a)
1792 { return v_scan_forward(v_reinterpret_as_s16(a)); }
1793 inline int v_scan_forward(const v_uint32& a)
1794 { return v_scan_forward(v_reinterpret_as_s32(a)); }
1795 inline int v_scan_forward(const v_float32& a)
1796 { return v_scan_forward(v_reinterpret_as_s32(a)); }
1797 inline int v_scan_forward(const v_uint64& a)
1798 { return v_scan_forward(v_reinterpret_as_s64(a)); }
1799 #if CV_SIMD_SCALABLE_64F
1800 inline int v_scan_forward(const v_float64& a)
1801 { return v_scan_forward(v_reinterpret_as_s64(a)); }
1802 #endif
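// Usage sketch (illustrative, not part of this header): v_signmask packs
// the lane sign bits into an integer (bit i = lane i), and v_scan_forward
// returns the index of the first negative lane, or -1 if none. Assuming
// VLEN = 128 (4 int32 lanes):
//   int d[] = {3, -1, 7, -2};
//   int m = v_signmask(v_load(d));     // 0b1010 = 10
//   int i = v_scan_forward(v_load(d)); // 1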
1803 
1805 // {A0, A1, A2, A3, B0, B1, B2, B3, C0 ...} --> {A0, A1, A2, B0, B1, B2, C0 ...}
1806 // mask: {0,0,0,1, ...} -> {T,T,T,F, ...}
1807 #define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, v_trunc) \
1808 inline _Tpvec v_pack_triplets(const _Tpvec& vec) { \
1809  size_t vl = __cv_rvv_e8m1_nlanes; \
1810  vuint32m1_t one = vmv_v_x_u32m1(1, __cv_rvv_e32m1_nlanes); \
1811  vuint8m1_t zero = vmv_v_x_u8m1(0, vl); \
1812  vuint8m1_t mask = vreinterpret_u8m1(one); \
1813  return vcompress(vmseq(v_trunc(vslideup(zero, mask, 3, vl)), 0, vl), vec, vec, VTraits<_Tpvec>::vlanes()); \
1814 }
1815 
1816 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8, OPENCV_HAL_NOP)
1817 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8, OPENCV_HAL_NOP)
1818 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16, vlmul_trunc_u8mf2)
1819 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16, vlmul_trunc_u8mf2)
1820 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32, vlmul_trunc_u8mf4)
1821 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32, vlmul_trunc_u8mf4)
1822 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32, vlmul_trunc_u8mf4)
1823 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint64, vlmul_trunc_u8mf8)
1824 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int64, vlmul_trunc_u8mf8)
1825 #if CV_SIMD_SCALABLE_64F
1826 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float64, vlmul_trunc_u8mf8)
1827 #endif
1828 
1829 
1831 
1832 #if defined(__riscv_zfh) && __riscv_zfh
1833 inline v_float32 v_load_expand(const hfloat* ptr)
1834 {
1835  return vfwcvt_f(vle16_v_f16mf2((_Float16*)ptr, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1836 }
1837 
1838 inline void v_pack_store(hfloat* ptr, const v_float32& v)
1839 {
1840  vse16_v_f16mf2((_Float16*)ptr, vfncvt_f_f_w_f16mf2(v, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1841 }
1842 #else
1843 inline v_float32 v_load_expand(const hfloat* ptr)
1844 {
1845  float buf[32];
1846  for( int i = 0; i < VTraits<v_float32>::vlanes(); i++ ) buf[i] = (float)ptr[i];
1847  return v_load(buf);
1848 }
1849 
1850 inline void v_pack_store(hfloat* ptr, const v_float32& v)
1851 {
1852  float buf[32];
1853  v_store(buf, v);
1854  for( int i = 0; i < VTraits<v_float32>::vlanes(); i++ ) ptr[i] = hfloat(buf[i]);
1855 }
1856 #endif
1858 inline v_int32 v_round(const v_float32& a)
1859 {
1860  // return vfcvt_x(vfadd(a, 1e-6, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1861  return vfcvt_x(a, VTraits<v_float32>::vlanes());
1862 }
1863 
1864 inline v_int32 v_floor(const v_float32& a)
1865 {
1866  return vfcvt_x(vfsub(a, 0.5f - 1e-5, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1867  // return vfcvt_x(a, VTraits<v_float32>::vlanes());
1868 }
1869 
1870 inline v_int32 v_ceil(const v_float32& a)
1871 {
1872  return vfcvt_x(vfadd(a, 0.5f - 1e-5, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1873 }
1874 
1875 inline v_int32 v_trunc(const v_float32& a)
1876 {
1877  return vfcvt_rtz_x(a, VTraits<v_float32>::vlanes());
1878 }
1879 #if CV_SIMD_SCALABLE_64F
1880 inline v_int32 v_round(const v_float64& a)
1881 {
1882  return vfncvt_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
1883 }
1884 
1885 inline v_int32 v_round(const v_float64& a, const v_float64& b)
1886 {
1887  // return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
1888  // Fix https://github.com/opencv/opencv/issues/24746
1889  return vfncvt_x(vset(vlmul_ext_f64m2(a), 1, b), VTraits<v_float32>::vlanes());
1890 }
1891 
1892 inline v_int32 v_floor(const v_float64& a)
1893 {
1894  return vfncvt_x(vlmul_ext_f64m2(vfsub(a, 0.5f - 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
1895 }
1896 
1897 inline v_int32 v_ceil(const v_float64& a)
1898 {
1899  return vfncvt_x(vlmul_ext_f64m2(vfadd(a, 0.5f - 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
1900 }
1901 
1902 inline v_int32 v_trunc(const v_float64& a)
1903 {
1904  return vfncvt_rtz_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
1905 }
1906 #endif
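// Usage sketch (illustrative, not part of this header): the four rounding
// modes, assuming VLEN = 128 (4 float lanes) and the default RNE dynamic
// rounding mode, with x = {1.5, -1.5, 2.7, -2.7}:
//   v_round(x) -> {2, -2, 3, -3} (nearest, ties to even)
//   v_floor(x) -> {1, -2, 2, -3}
//   v_ceil(x)  -> {2, -1, 3, -2}
//   v_trunc(x) -> {1, -1, 2, -2}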
1907 
1909 
1910 // 16 >> 32
1911 inline v_int32 v_dotprod(const v_int16& a, const v_int16& b)
1912 {
1913  vint32m2_t temp1 = vwmul(a, b, VTraits<v_int16>::vlanes());
1914  return v_hadd(temp1);
1915 }
1916 
1917 inline v_int32 v_dotprod(const v_int16& a, const v_int16& b, const v_int32& c)
1918 {
1919  vint32m2_t temp1 = vwmul(a, b, VTraits<v_int16>::vlanes());
1920  return vadd(v_hadd(temp1), c, VTraits<v_int32>::vlanes());
1921 }
1922 
1923 // 32 >> 64
1924 inline v_int64 v_dotprod(const v_int32& a, const v_int32& b)
1925 {
1926  vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
1927  vuint32m1_t one32 = vreinterpret_u32m1(one64);
1928  vbool32_t mask = vmseq(one32, 1, VTraits<v_uint32>::vlanes());
1929  vint64m2_t temp1 = vwmul(a, b, VTraits<v_int32>::vlanes());
1930  vint64m2_t temp2 = vslide1down(temp1, 0, VTraits<v_int32>::vlanes());
1931  vint64m2_t res = vadd(temp1, temp2, VTraits<v_int32>::vlanes());
1932  res = vcompress(mask, res, res, VTraits<v_int32>::vlanes());
1933  return vlmul_trunc_i64m1(res);
1934 }
1935 inline v_int64 v_dotprod(const v_int32& a, const v_int32& b, const v_int64& c)
1936 {
1937  vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
1938  vuint32m1_t one32 = vreinterpret_u32m1(one64);
1939  vbool32_t mask = vmseq(one32, 1, VTraits<v_uint32>::vlanes());
1940  vint64m2_t temp1 = vwmul(a, b, VTraits<v_int32>::vlanes());
1941  vint64m2_t temp2 = vslide1down(temp1, 0, VTraits<v_int32>::vlanes());
1942  vint64m2_t res = vadd(temp1, temp2, VTraits<v_int32>::vlanes());
1943  res = vcompress(mask, res, res, VTraits<v_int32>::vlanes());
1944  return vadd(vlmul_trunc_i64m1(res), c, VTraits<v_int64>::vlanes());
1945 }
1946 
1947 // 8 >> 32
1948 inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b)
1949 {
1950  vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
1951  vuint8m1_t one8 = vreinterpret_u8m1(one32);
1952  vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
1953  vuint16m2_t t0 = vwmulu(a, b, VTraits<v_uint8>::vlanes());
1954  vuint16m2_t t1 = vslide1down(t0, 0, VTraits<v_uint8>::vlanes());
1955  vuint16m2_t t2 = vslide1down(t1, 0, VTraits<v_uint8>::vlanes());
1956  vuint16m2_t t3 = vslide1down(t2, 0, VTraits<v_uint8>::vlanes());
1957  vuint32m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint8>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint8>::vlanes()), VTraits<v_uint8>::vlanes());
1958  res = vcompress(mask, res, res, VTraits<v_uint8>::vlanes());
1959  return vlmul_trunc_u32m1(res);
1960 }
1961 
1962 inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b,
1963  const v_uint32& c)
1964 {
1965  vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
1966  vuint8m1_t one8 = vreinterpret_u8m1(one32);
1967  vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
1968  vuint16m2_t t0 = vwmulu(a, b, VTraits<v_uint8>::vlanes());
1969  vuint16m2_t t1 = vslide1down(t0, 0, VTraits<v_uint8>::vlanes());
1970  vuint16m2_t t2 = vslide1down(t1, 0, VTraits<v_uint8>::vlanes());
1971  vuint16m2_t t3 = vslide1down(t2, 0, VTraits<v_uint8>::vlanes());
1972  vuint32m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint8>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint8>::vlanes()), VTraits<v_uint8>::vlanes());
1973  res = vcompress(mask, res, res, VTraits<v_uint8>::vlanes());
1974  return vadd(vlmul_trunc_u32m1(res), c, VTraits<v_uint8>::vlanes());
1975 }
1976 
1977 inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b)
1978 {
1979  vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
1980  vuint8m1_t one8 = vreinterpret_u8m1(one32);
1981  vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
1982  vint16m2_t t0 = vwmul(a, b, VTraits<v_int8>::vlanes());
1983  vint16m2_t t1 = vslide1down(t0, 0, VTraits<v_int8>::vlanes());
1984  vint16m2_t t2 = vslide1down(t1, 0, VTraits<v_int8>::vlanes());
1985  vint16m2_t t3 = vslide1down(t2, 0, VTraits<v_int8>::vlanes());
1986  vint32m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int8>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int8>::vlanes()), VTraits<v_int8>::vlanes());
1987  res = vcompress(mask, res, res, VTraits<v_int8>::vlanes());
1988  return vlmul_trunc_i32m1(res);
1989 }
1990 
1991 inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b,
1992  const v_int32& c)
1993 {
1994  vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
1995  vuint8m1_t one8 = vreinterpret_u8m1(one32);
1996  vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
1997  vint16m2_t t0 = vwmul(a, b, VTraits<v_int8>::vlanes());
1998  vint16m2_t t1 = vslide1down(t0, 0, VTraits<v_int8>::vlanes());
1999  vint16m2_t t2 = vslide1down(t1, 0, VTraits<v_int8>::vlanes());
2000  vint16m2_t t3 = vslide1down(t2, 0, VTraits<v_int8>::vlanes());
2001  vint32m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int8>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int8>::vlanes()), VTraits<v_int8>::vlanes());
2002  res = vcompress(mask, res, res, VTraits<v_int8>::vlanes());
2003  return vadd(vlmul_trunc_i32m1(res), c, VTraits<v_int8>::vlanes());
2004 }
2005 
2006 
2007 // 16 >> 64
2008 inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b)
2009 {
2010  vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
2011  vuint16m1_t one16 = vreinterpret_u16m1(one64);
2012  vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
2013  vuint32m2_t t0 = vwmulu(a, b, VTraits<v_uint16>::vlanes());
2014  vuint32m2_t t1 = vslide1down(t0, 0, VTraits<v_uint16>::vlanes());
2015  vuint32m2_t t2 = vslide1down(t1, 0, VTraits<v_uint16>::vlanes());
2016  vuint32m2_t t3 = vslide1down(t2, 0, VTraits<v_uint16>::vlanes());
2017  vuint64m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint16>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint16>::vlanes()), VTraits<v_uint16>::vlanes());
2018  res = vcompress(mask, res, res, VTraits<v_uint16>::vlanes());
2019  return vlmul_trunc_u64m1(res);
2020 }
2021 inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b, const v_uint64& c)
2022 {
2023  vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
2024  vuint16m1_t one16 = vreinterpret_u16m1(one64);
2025  vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
2026  vuint32m2_t t0 = vwmulu(a, b, VTraits<v_uint16>::vlanes());
2027  vuint32m2_t t1 = vslide1down(t0, 0, VTraits<v_uint16>::vlanes());
2028  vuint32m2_t t2 = vslide1down(t1, 0, VTraits<v_uint16>::vlanes());
2029  vuint32m2_t t3 = vslide1down(t2, 0, VTraits<v_uint16>::vlanes());
2030  vuint64m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint16>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint16>::vlanes()), VTraits<v_uint16>::vlanes());
2031  res = vcompress(mask, res, res, VTraits<v_uint16>::vlanes());
2032  return vadd(vlmul_trunc_u64m1(res), c, VTraits<v_uint16>::vlanes());
2033 }
2034 
2035 inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b)
2036 {
2037  vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
2038  vuint16m1_t one16 = vreinterpret_u16m1(one64);
2039  vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
2040  vint32m2_t t0 = vwmul(a, b, VTraits<v_int16>::vlanes());
2041  vint32m2_t t1 = vslide1down(t0, 0, VTraits<v_int16>::vlanes());
2042  vint32m2_t t2 = vslide1down(t1, 0, VTraits<v_int16>::vlanes());
2043  vint32m2_t t3 = vslide1down(t2, 0, VTraits<v_int16>::vlanes());
2044  vint64m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int16>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int16>::vlanes()), VTraits<v_int16>::vlanes());
2045  res = vcompress(mask, res, res, VTraits<v_int16>::vlanes());
2046  return vlmul_trunc_i64m1(res);
2047 }
2048 inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b,
2049  const v_int64& c)
2050 {
2051  vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
2052  vuint16m1_t one16 = vreinterpret_u16m1(one64);
2053  vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
2054  vint32m2_t t0 = vwmul(a, b, VTraits<v_int16>::vlanes());
2055  vint32m2_t t1 = vslide1down(t0, 0, VTraits<v_int16>::vlanes());
2056  vint32m2_t t2 = vslide1down(t1, 0, VTraits<v_int16>::vlanes());
2057  vint32m2_t t3 = vslide1down(t2, 0, VTraits<v_int16>::vlanes());
2058  vint64m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int16>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int16>::vlanes()), VTraits<v_int16>::vlanes());
2059  res = vcompress(mask, res, res, VTraits<v_int16>::vlanes());
2060  return vadd(vlmul_trunc_i64m1(res), c, VTraits<v_int16>::vlanes());
2061 }
2062 
2063 // 32 >> 64f
2064 #if CV_SIMD_SCALABLE_64F
2065 inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b)
2066 { return v_cvt_f64(v_dotprod(a, b)); }
2067 inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b,
2068  const v_float64& c)
2069 { return v_add(v_dotprod_expand(a, b) , c); }
2070 #endif
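// Usage sketch (illustrative, not part of this header): v_dotprod
// multiplies adjacent lane pairs at double width and sums each pair.
// Assuming VLEN = 128 (8 int16 lanes):
//   short a[] = {1, 2, 3, 4, 5, 6, 7, 8};
//   short b[] = {1, 1, 1, 1, 1, 1, 1, 1};
//   v_int32 d = v_dotprod(v_load(a), v_load(b)); // {3, 7, 11, 15}
// v_dotprod_expand does the same over groups of four lanes (8 -> 32 and
// 16 -> 64 bits), which is what the slide/compress sequences above build.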
2071 
2073 // 16 >> 32
2074 inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b)
2075 {
2076  v_int32 zero = v_setzero_s32();
2077  return vredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero, VTraits<v_int16>::vlanes());
2078 }
2079 inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b, const v_int32& c)
2080 {
2081  v_int32 zero = v_setzero_s32();
2082  return vredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), vredsum(zero, c, zero, VTraits<v_int32>::vlanes()), VTraits<v_int16>::vlanes());
2083 }
2084 
2085 // 32 >> 64
2086 inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b)
2087 {
2088  v_int64 zero = v_setzero_s64();
2089  return vredsum(zero, vwmul(a, b, VTraits<v_int32>::vlanes()), zero, VTraits<v_int32>::vlanes());
2090 }
2091 inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b, const v_int64& c)
2092 {
2093  v_int64 zero = v_setzero_s64();
2094  return vadd(vredsum(zero, vwmul(a, b, VTraits<v_int32>::vlanes()), zero, VTraits<v_int32>::vlanes()) , vredsum(zero, c, zero, VTraits<v_int64>::vlanes()), VTraits<v_int64>::vlanes());
2095 }
2096 
2097 
2098 // 8 >> 32
2099 inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b)
2100 {
2101  v_uint32 zero = v_setzero_u32();
2102  return vwredsumu(zero, vwmulu(a, b, VTraits<v_uint8>::vlanes()), zero, VTraits<v_uint8>::vlanes());
2103 }
2104 inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b, const v_uint32& c)
2105 {
2106  v_uint32 zero = v_setzero_u32();
2107  return vadd(vwredsumu(zero, vwmulu(a, b, VTraits<v_uint8>::vlanes()), zero, VTraits<v_uint8>::vlanes()) , vredsum(zero, c, zero, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
2108 }
2109 inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b)
2110 {
2111  v_int32 zero = v_setzero_s32();
2112  return vwredsum(zero, vwmul(a, b, VTraits<v_int8>::vlanes()), zero, VTraits<v_int8>::vlanes());
2113 }
2114 inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b, const v_int32& c)
2115 {
2116  v_int32 zero = v_setzero_s32();
2117  return vadd(vwredsum(zero, vwmul(a, b, VTraits<v_int8>::vlanes()), zero, VTraits<v_int8>::vlanes()) , vredsum(zero, c, zero, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
2118 }
2119 
2120 // 16 >> 64
2121 inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b)
2122 {
2123  v_uint64 zero = v_setzero_u64();
2124  return vwredsumu(zero, vwmulu(a, b, VTraits<v_uint16>::vlanes()), zero, VTraits<v_uint16>::vlanes());
2125 }
2126 inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b, const v_uint64& c)
2127 {
2128  v_uint64 zero = v_setzero_u64();
2129  return vadd(vwredsumu(zero, vwmulu(a, b, VTraits<v_uint16>::vlanes()), zero, VTraits<v_uint16>::vlanes()), vredsum(zero, c, zero, VTraits<v_uint64>::vlanes()), VTraits<v_uint64>::vlanes());
2130 }
2131 inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b)
2132 {
2133  v_int64 zero = v_setzero_s64();
2134  return vwredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero, VTraits<v_int16>::vlanes());
2135 }
2136 inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b, const v_int64& c)
2137 {
2138  v_int64 zero = v_setzero_s64();
2139  return vadd(vwredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero, VTraits<v_int16>::vlanes()), vredsum(zero, c, zero, VTraits<v_int64>::vlanes()), VTraits<v_int64>::vlanes());
2140 }
2141 
2142 // 32 >> 64f
2143 #if CV_SIMD_SCALABLE_64F
2144 inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b)
2145 { return v_cvt_f64(v_dotprod_fast(a, b)); }
2146 inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b, const v_float64& c)
2147 { return v_add(v_dotprod_expand_fast(a, b) , c); }
2148 #endif
2149 
2150 // TODO: only 128-bit (4-lane) vectors are supported for now.
2151 inline v_float32 v_matmul(const v_float32& v, const v_float32& m0,
2152  const v_float32& m1, const v_float32& m2,
2153  const v_float32& m3)
2154 {
2155  vfloat32m1_t res;
2156  res = vfmul_vf_f32m1(m0, v_extract_n(v, 0), VTraits<v_float32>::vlanes());
2157  res = vfmacc_vf_f32m1(res, v_extract_n(v, 1), m1, VTraits<v_float32>::vlanes());
2158  res = vfmacc_vf_f32m1(res, v_extract_n(v, 2), m2, VTraits<v_float32>::vlanes());
2159  res = vfmacc_vf_f32m1(res, v_extract_n(v, 3), m3, VTraits<v_float32>::vlanes());
2160  return res;
2161 }
2162 
2163 // TODO: only 128-bit (4-lane) vectors are supported for now.
2164 inline v_float32 v_matmuladd(const v_float32& v, const v_float32& m0,
2165  const v_float32& m1, const v_float32& m2,
2166  const v_float32& a)
2167 {
2168  vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n(v,0), VTraits<v_float32>::vlanes());
2169  res = vfmacc_vf_f32m1(res, v_extract_n(v,1), m1, VTraits<v_float32>::vlanes());
2170  res = vfmacc_vf_f32m1(res, v_extract_n(v,2), m2, VTraits<v_float32>::vlanes());
2171  return vfadd(res, a, VTraits<v_float32>::vlanes());
2172 }
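// Usage sketch (illustrative, not part of this header): with m0..m3 as
// matrix columns and v = {x, y, z, w}, v_matmul returns
//   x*m0 + y*m1 + z*m2 + w*m3,
// i.e. a 4x4 matrix-vector product; v_matmuladd replaces the w*m3 term
// with the constant vector a (an affine transform of a 3-vector).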
2173 
2174 inline void v_cleanup() {}
2175 
2176 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2177 
2179 
2180 } //namespace cv
2181 
2182 #endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP