EstervQrCode 2.0.0
Library for QR code manipulation
intrin_rvv_scalable.hpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// The original implementation is contributed by HAN Liutong.
// Copyright (C) 2022, Institute of Software, Chinese Academy of Sciences.

#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP

#include <opencv2/core/check.hpp>

// RVV intrinsics have been renamed in version 0.11, so we need to include
// compatibility headers:
// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>10999
#include "intrin_rvv_010_compat_non-policy.hpp"
#include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
#endif

#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
#include "intrin_rvv_011_compat.hpp"
#endif

#if defined(__GNUC__) && !defined(__clang__)
// FIXIT: eliminate massive warnings from templates
// GCC from 'rvv-next': riscv64-unknown-linux-gnu-g++ (g42df3464463) 12.0.1 20220505 (prerelease)
// doesn't work: #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

#ifndef CV_RVV_MAX_VLEN
#define CV_RVV_MAX_VLEN 1024
#endif

namespace cv
{


CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD_SCALABLE 1
#define CV_SIMD_SCALABLE_64F 1

using v_uint8 = vuint8m1_t;
using v_int8 = vint8m1_t;
using v_uint16 = vuint16m1_t;
using v_int16 = vint16m1_t;
using v_uint32 = vuint32m1_t;
using v_int32 = vint32m1_t;
using v_uint64 = vuint64m1_t;
using v_int64 = vint64m1_t;

using v_float32 = vfloat32m1_t;
#if CV_SIMD_SCALABLE_64F
using v_float64 = vfloat64m1_t;
#endif

using uchar = unsigned char;
using schar = signed char;
using ushort = unsigned short;
using uint = unsigned int;
using uint64 = unsigned long int;
using int64 = long int;

static const int __cv_rvv_e8m1_nlanes = vsetvlmax_e8m1();
static const int __cv_rvv_e16m1_nlanes = vsetvlmax_e16m1();
static const int __cv_rvv_e32m1_nlanes = vsetvlmax_e32m1();
static const int __cv_rvv_e64m1_nlanes = vsetvlmax_e64m1();
static const int __cv_rvv_e8m2_nlanes = vsetvlmax_e8m2();
static const int __cv_rvv_e16m2_nlanes = vsetvlmax_e16m2();
static const int __cv_rvv_e32m2_nlanes = vsetvlmax_e32m2();
static const int __cv_rvv_e64m2_nlanes = vsetvlmax_e64m2();
static const int __cv_rvv_e8m4_nlanes = vsetvlmax_e8m4();
static const int __cv_rvv_e16m4_nlanes = vsetvlmax_e16m4();
static const int __cv_rvv_e32m4_nlanes = vsetvlmax_e32m4();
static const int __cv_rvv_e64m4_nlanes = vsetvlmax_e64m4();
static const int __cv_rvv_e8m8_nlanes = vsetvlmax_e8m8();
static const int __cv_rvv_e16m8_nlanes = vsetvlmax_e16m8();
static const int __cv_rvv_e32m8_nlanes = vsetvlmax_e32m8();
static const int __cv_rvv_e64m8_nlanes = vsetvlmax_e64m8();

template <class T>
struct VTraits;

#define OPENCV_HAL_IMPL_RVV_TRAITS(REG, TYP, SUF, SZ) \
template <> \
struct VTraits<REG> \
{ \
    static inline int vlanes() { return __cv_rvv_##SUF##_nlanes; } \
    using lane_type = TYP; \
    static const int max_nlanes = CV_RVV_MAX_VLEN/SZ; \
};

OPENCV_HAL_IMPL_RVV_TRAITS(vint8m1_t, int8_t, e8m1, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vint8m2_t, int8_t, e8m2, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vint8m4_t, int8_t, e8m4, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vint8m8_t, int8_t, e8m8, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m1_t, uint8_t, e8m1, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m2_t, uint8_t, e8m2, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m4_t, uint8_t, e8m4, 8)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m8_t, uint8_t, e8m8, 8)

OPENCV_HAL_IMPL_RVV_TRAITS(vint16m1_t, int16_t, e16m1, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vint16m2_t, int16_t, e16m2, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vint16m4_t, int16_t, e16m4, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vint16m8_t, int16_t, e16m8, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m1_t, uint16_t, e16m1, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m2_t, uint16_t, e16m2, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m4_t, uint16_t, e16m4, 16)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m8_t, uint16_t, e16m8, 16)

OPENCV_HAL_IMPL_RVV_TRAITS(vint32m1_t, int32_t, e32m1, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vint32m2_t, int32_t, e32m2, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vint32m4_t, int32_t, e32m4, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vint32m8_t, int32_t, e32m8, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m1_t, uint32_t, e32m1, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m2_t, uint32_t, e32m2, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m4_t, uint32_t, e32m4, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m8_t, uint32_t, e32m8, 32)

OPENCV_HAL_IMPL_RVV_TRAITS(vint64m1_t, int64_t, e64m1, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vint64m2_t, int64_t, e64m2, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vint64m4_t, int64_t, e64m4, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vint64m8_t, int64_t, e64m8, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m1_t, uint64_t, e64m1, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m2_t, uint64_t, e64m2, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m4_t, uint64_t, e64m4, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m8_t, uint64_t, e64m8, 64)

OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m1_t, float, e32m1, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m2_t, float, e32m2, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m4_t, float, e32m4, 32)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m8_t, float, e32m8, 32)

#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m1_t, double, e64m1, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m2_t, double, e64m2, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m4_t, double, e64m4, 64)
OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m8_t, double, e64m8, 64)
#endif
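
// Usage sketch (illustrative addition, not part of the upstream header): with a
// scalable VLEN the lane count is only known at run time, so loops step by
// VTraits<...>::vlanes() while stack buffers use the compile-time max_nlanes:
//
//   int step = VTraits<v_float32>::vlanes();    // lanes per vector register
//   float buf[VTraits<v_float32>::max_nlanes];  // worst case for CV_RVV_MAX_VLEN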


// LLVM/Clang defines "overloaded intrinsics", e.g. 'vand(op1, op2)'.
// GCC does not have these functions, so we need to implement them manually.
// We implement only a selected subset required to build the current state of the code.
// Included inside namespace cv::
#ifndef __riscv_v_intrinsic_overloading
#include "intrin_rvv_compat_overloaded.hpp"
#endif // __riscv_v_intrinsic_overloading


#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \
inline _Tp v_get0(const v_##_Tpvec& v) \
{ \
    return vmv_x(v); \
}

OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)

inline float v_get0(const v_float32& v)
{
    return vfmv_f(v);
}
#if CV_SIMD_SCALABLE_64F
inline double v_get0(const v_float64& v)
{
    return vfmv_f(v);
}
#endif
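
// Usage sketch (illustrative addition): v_get0 reads lane 0 of a vector; the
// reduction helpers further below use it to return their scalar result, e.g.
//
//   v_float32 v = v_setall_f32(2.5f);  // v_setall_f32 is defined just below
//   float first = v_get0(v);           // 2.5f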


#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
inline v_##_Tpvec v_setzero_##suffix1() \
{ \
    return vmv_v_x_##suffix2##m1(0, vl); \
} \
inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
{ \
    return vmv_v_x_##suffix2##m1(v, vl); \
}

OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits<v_int64>::vlanes())

#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
inline v_##_Tpv v_setzero_##suffix() \
{ \
    return vfmv_v_f_##suffix##m1(0, vl); \
} \
inline v_##_Tpv v_setall_##suffix(_Tp v) \
{ \
    return vfmv_v_f_##suffix##m1(v, vl); \
}

OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
#endif
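
// Usage sketch (illustrative addition): the v_setzero_*/v_setall_* pairs are
// the usual way to build constants spanning the whole run-time sized register:
//
//   v_int32 zeros = v_setzero_s32();
//   v_float32 halves = v_setall_f32(0.5f);  // 0.5f broadcast to all vlanes()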

#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \
{ \
    return v; \
}
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64)
#endif
// TODO: can be simplified by using overloaded RV intrinsic
#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v)); \
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v)); \
}

OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64)
#endif
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64)


#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v)); \
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v)); \
}

OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64)
// Three times reinterpret
inline v_float32 v_reinterpret_as_f32(const v_float64& v)
{
    return vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v)));
}

inline v_float64 v_reinterpret_as_f64(const v_float32& v)
{
    return vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v)));
}
#endif
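
// Usage sketch (illustrative addition): v_reinterpret_as_* is a bit-pattern
// cast, not a value conversion; e.g. isolating IEEE-754 sign bits of a
// v_float32 (using v_and/v_setall_u32 defined elsewhere in this header):
//
//   v_uint32 bits = v_reinterpret_as_u32(f);
//   v_uint32 sign = v_and(bits, v_setall_u32(0x80000000u));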


#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vl) \
template <int s = 0> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
{ \
    return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \
} \
template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
{ \
    return vmv_x(vslidedown(v_setzero_##suffix(), v, i, vl)); \
}


OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8, uchar, u8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8, schar, s8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16, ushort, u16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16, short, s16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32, unsigned int, u32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32, int, s32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64, uint64, u64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64, int64, s64, VTraits<v_int64>::vlanes())

#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vl) \
template <int s = 0> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
{ \
    return vslideup(vslidedown(v_setzero_##suffix(), a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \
} \
template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
{ \
    return vfmv_f(vslidedown(v_setzero_##suffix(), v, i, vl)); \
}

OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, f64, VTraits<v_float64>::vlanes())
#endif

#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) \
inline _Tp v_extract_highest(_Tpvec v) \
{ \
    return v_extract_n(v, vl-1); \
}

OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8, schar, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16, short, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits<v_int64>::vlanes())
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits<v_float64>::vlanes())
#endif
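
// Usage sketch (illustrative addition): lane extraction mirrors the fixed-size
// universal intrinsics; the index may be a template constant or run-time value:
//
//   int   x2   = v_extract_n<2>(vi);     // vi: v_int32, lane 2
//   float last = v_extract_highest(vf);  // vf: v_float32, lane vlanes()-1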


#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
    return vle##width##_v_##suffix##m1(ptr, vl); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
    return vle##width##_v_##suffix##m1(ptr, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
    return vle##width##_v_##suffix##m1(ptr, hvl); \
} \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return vslideup(vle##width##_v_##suffix##m1(ptr0, hvl), vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, vl); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, vl); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, vl); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, a, hvl); \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
} \
template<typename... Targs> \
_Tpvec v_load_##suffix(Targs... nScalars) \
{ \
    /* materialize the scalars in a buffer: a braced list cannot bind to const _Tp* */ \
    _Tp buf[] = { static_cast<_Tp>(nScalars)... }; \
    return v_load(buf); \
}


OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits<v_uint8>::vlanes() / 2, VTraits<v_uint8>::vlanes(), 8, u8, vmv_v_x_u8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits<v_int8>::vlanes() / 2, VTraits<v_int8>::vlanes(), 8, i8, vmv_v_x_i8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits<v_uint16>::vlanes() / 2, VTraits<v_uint16>::vlanes(), 16, u16, vmv_v_x_u16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits<v_int16>::vlanes() / 2, VTraits<v_int16>::vlanes(), 16, i16, vmv_v_x_i16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits<v_uint32>::vlanes() / 2, VTraits<v_uint32>::vlanes(), 32, u32, vmv_v_x_u32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits<v_int32>::vlanes() / 2, VTraits<v_int32>::vlanes(), 32, i32, vmv_v_x_i32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits<v_uint64>::vlanes() / 2, VTraits<v_uint64>::vlanes(), 64, u64, vmv_v_x_u64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits<v_int64>::vlanes() / 2, VTraits<v_int64>::vlanes(), 64, i64, vmv_v_x_i64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits<v_float32>::vlanes() / 2, VTraits<v_float32>::vlanes(), 32, f32, vfmv_v_f_f32m1)

#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_float64>::vlanes() / 2, VTraits<v_float64>::vlanes(), 64, f64, vfmv_v_f_f64m1)
#endif
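
// Usage sketch (illustrative addition): a typical VLEN-agnostic loop over the
// load/store ops above, using the arithmetic defined further below (tail
// handling omitted for brevity):
//
//   void scale(const float* src, float* dst, int n, float k)
//   {
//       int step = VTraits<v_float32>::vlanes();
//       v_float32 vk = v_setall_f32(k);
//       for (int i = 0; i + step <= n; i += step)
//           v_store(dst + i, v_mul(v_load(src + i), vk));
//   }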

#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
{ \
    auto vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
#endif

#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
{ \
    auto v0 = vle32_v_u32##suffix1((unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); \
    auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/2); \
    auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/2); \
    auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/2); \
    auto sh1 = vslide1up(v_trunc(vreinterpret_u32##suffix2(w1)), 0, VTraits<_Tpvec>::vlanes()); \
    auto vid = vor(sh1, v_trunc(vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); \
    auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, vlmul_trunc_u32mf2)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, vlmul_trunc_u32mf2)
#endif


#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) \
inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
{ \
    auto v0 = vle32_v_u32##suffix0((unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); \
    auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/4); \
    auto v2 = vadd(v0, 2, VTraits<_Tpvec>::vlanes()/4); \
    auto v3 = vadd(v0, 3, VTraits<_Tpvec>::vlanes()/4); \
    auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/4); \
    auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/4); \
    auto w2 = vwcvtu_x(v2, VTraits<_Tpvec>::vlanes()/4); \
    auto w3 = vwcvtu_x(v3, VTraits<_Tpvec>::vlanes()/4); \
    auto sh2 = vslide1up(vreinterpret_u32##suffix1(w2), 0, VTraits<_Tpvec>::vlanes()/2); \
    auto sh3 = vslide1up(vreinterpret_u32##suffix1(w3), 0, VTraits<_Tpvec>::vlanes()/2); \
    auto vid0 = vor(sh2, vreinterpret_u32##suffix1(w0), VTraits<_Tpvec>::vlanes()/2); \
    auto vid1 = vor(sh3, vreinterpret_u32##suffix1(w1), VTraits<_Tpvec>::vlanes()/2); \
    auto wid0 = vwcvtu_x(v_trunc(vid0), VTraits<_Tpvec>::vlanes()/2); \
    auto wid1 = vwcvtu_x(v_trunc(vid1), VTraits<_Tpvec>::vlanes()/2); \
    auto shwid1 = vslide1up(vreinterpret_u32##suffix2(wid1), 0, VTraits<_Tpvec>::vlanes()); \
    auto vid = vor(shwid1, vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); \
    auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2, m1, m2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, vlmul_trunc_u32mf2)
OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, vlmul_trunc_u32mf2)

#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \
inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
{ \
    v_uint32 vidx_ = vmul(vreinterpret_u32m1(vidx), sizeof(_Tp), VTraits<v_int32>::vlanes()); \
    return vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \
}
OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float)
OPENCV_HAL_IMPL_RVV_LUT_VEC(v_int32, int)
OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned)

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_lut(const double* tab, const v_int32& vidx)
{
    vuint32mf2_t vidx_ = vmul(vlmul_trunc_u32mf2(vreinterpret_u32m1(vidx)), sizeof(double), VTraits<v_float64>::vlanes());
    return vloxei32(tab, vidx_, VTraits<v_float64>::vlanes());
}
#endif


inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
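
// Usage sketch (illustrative addition): v_lut gathers tab[idx[0..vlanes()-1]]
// through an indexed load, e.g. for a table-driven transform:
//
//   int idx[VTraits<v_float32>::max_nlanes] = {0, 3, 1, 2 /* ... */};
//   v_float32 g = v_lut(table, idx);  // table: const float*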

inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b)
{
    return vnsrl(vset(vlmul_ext_v_u16m1_u16m2(a), 1, b), 0, VTraits<v_uint8>::vlanes());
}

inline v_uint8 v_pack_b(const v_uint32& a, const v_uint32& b,
                        const v_uint32& c, const v_uint32& d)
{
    return vnsrl(vnsrl(vset(vset(vset(vlmul_ext_u32m4(a), 1, b), 2, c), 3, d), 0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes());
}

inline v_uint8 v_pack_b(const v_uint64& a, const v_uint64& b, const v_uint64& c,
                        const v_uint64& d, const v_uint64& e, const v_uint64& f,
                        const v_uint64& g, const v_uint64& h)
{
    return vnsrl(vnsrl(vnsrl(
        vset(vset(vset(vset(vset(vset(vset(vlmul_ext_u64m8(a),
        1,b),2,c),3,d),4,e),5,f),6,g),7,h),
        0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes()), 0, VTraits<v_uint8>::vlanes());
}
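
// Usage sketch (illustrative addition): v_pack_b narrows per-lane boolean masks,
// such as those produced by the comparison ops defined below, into byte masks:
//
//   v_uint16 m0 = v_reinterpret_as_u16(v_gt(a0, b0));  // a0, b0: v_int16
//   v_uint16 m1 = v_reinterpret_as_u16(v_gt(a1, b1));
//   v_uint8 mask8 = v_pack_b(m0, m1);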

#define OPENCV_HAL_IMPL_RVV_BIN_OP(_Tpvec, ocv_intrin, rvv_intrin) \
inline _Tpvec v_##ocv_intrin(const _Tpvec& a, const _Tpvec& b) \
{ \
    return rvv_intrin(a, b, VTraits<_Tpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub, vssubu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub, vssub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, vssubu)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, vssub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, sub, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, mul, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, add, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, sub, vfsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, mul, vfmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, div, vfdiv)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, sub, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, add, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, sub, vsub)

#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, add, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, sub, vfsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, mul, vfmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, div, vfdiv)
#endif

#define OPENCV_HAL_IMPL_RVV_BIN_MADD(_Tpvec, rvv_add) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
    return v_add(rvv_add(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \
}
#define OPENCV_HAL_IMPL_RVV_BIN_MMUL(_Tpvec, rvv_mul) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
    return v_mul(rvv_mul(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \
}
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint8, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int8, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint16, vsaddu)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int16, vsadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint32, vadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int32, vadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float32, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint64, vadd)
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, vadd)

OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, vmul)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, vfmul)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, vfadd)
OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, vfmul)
#endif
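
// Usage sketch (illustrative addition): the variadic overloads above fold a
// whole chain of sums or products in one call:
//
//   v_float32 s = v_add(a, b, c, d);  // ((a + b) + c) + d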

#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _TpwvecM2, suffix, wmul) \
inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
{ \
    _TpwvecM2 temp = wmul(a, b, VTraits<_Tpvec>::vlanes()); \
    c = vget_##suffix##m1(temp, 0); \
    d = vget_##suffix##m1(temp, 1); \
}

OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8, v_uint16, vuint16m2_t, u16, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m2_t, i16, vwmul)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m2_t, u32, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m2_t, i32, vwmul)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m2_t, u64, vwmulu)

inline v_int16 v_mul_hi(const v_int16& a, const v_int16& b)
{
    return vmulh(a, b, VTraits<v_int16>::vlanes());
}
inline v_uint16 v_mul_hi(const v_uint16& a, const v_uint16& b)
{
    return vmulhu(a, b, VTraits<v_uint16>::vlanes());
}
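
// Usage sketch (illustrative addition): v_mul_expand keeps the full-width
// product by widening into two result vectors:
//
//   v_uint32 lo, hi;
//   v_mul_expand(a, b, lo, hi);  // a, b: v_uint16; lo/hi hold the widened halves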

OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add_wrap, vadd)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub_wrap, vsub)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, mul_wrap, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, mul_wrap, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, mul_wrap, vmul)
OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, mul_wrap, vmul)


#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _clip, _wmul) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _clip(_wmul(a, b, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()); \
} \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& a1, const _Tpvec& a2, const Args&... va) { \
    return v_mul(_clip(_wmul(a1, a2, VTraits<_Tpvec>::vlanes()), 0, VTraits<_Tpvec>::vlanes()), va...); \
}

OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8, vnclipu, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8, vnclip, vwmul)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16, vnclipu, vwmulu)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16, vnclip, vwmul)


#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, vl) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vand(a, b, vl); \
} \
inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vor(a, b, vl); \
} \
inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vxor(a, b, vl); \
} \
inline _Tpvec v_not(const _Tpvec& a) \
{ \
    return vnot(a, vl); \
}

OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits<v_int64>::vlanes())

#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(intrin) \
inline v_float32 intrin(const v_float32& a, const v_float32& b) \
{ \
    return vreinterpret_f32m1(intrin(vreinterpret_i32m1(a), vreinterpret_i32m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_xor)

inline v_float32 v_not(const v_float32& a)
{
    return vreinterpret_f32m1(v_not(vreinterpret_i32m1(a)));
}

#if CV_SIMD_SCALABLE_64F
#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(intrin) \
inline v_float64 intrin(const v_float64& a, const v_float64& b) \
{ \
    return vreinterpret_f64m1(intrin(vreinterpret_i64m1(a), vreinterpret_i64m1(b))); \
}
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_and)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_or)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_xor)

inline v_float64 v_not(const v_float64& a)
{
    return vreinterpret_f64m1(v_not(vreinterpret_i64m1(a)));
}
#endif


/* Usage
1. v_shl<N>(vec);
2. v_shl(vec, N); // instead of vec << N, when N is non-constant.
*/

#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \
template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
{ \
    return _Tpvec(vsll(a, uint8_t(n), vl)); \
} \
template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
{ \
    return _Tpvec(vsrl(a, uint8_t(n), vl)); \
}

#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \
template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
{ \
    return _Tpvec(vsll(a, uint8_t(n), vl)); \
} \
template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
{ \
    return _Tpvec(vsra(a, uint8_t(n), vl)); \
}

OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64, VTraits<v_uint64>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64, VTraits<v_int64>::vlanes())

#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix) \
inline _Tpvec v_##op(const _Tpvec& a, const _Tpvec& b) \
{ \
    size_t VLEN = VTraits<_Tpvec>::vlanes(); \
    uint64_t ones = -1; \
    return vmerge(intrin(a, b, VLEN), vmv_v_x_##suffix##m1(0, VLEN), ones, VLEN); \
}

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix) \
inline _Tpvec v_##op(const _Tpvec& a, const _Tpvec& b) \
{ \
    size_t VLEN = VTraits<_Tpvec>::vlanes(); \
    union { uint64_t u; VTraits<_Tpvec>::lane_type d; } ones; \
    ones.u = -1; \
    auto diff = intrin(a, b, VLEN); \
    auto z = vfmv_v_f_##suffix##m1(0, VLEN); \
    auto res = vfmerge(diff, z, ones.d, VLEN); \
    return _Tpvec(res); \
} //TODO

#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmsltu, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgtu, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsleu, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsgeu, suffix)

#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmslt, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgt, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsle, suffix) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsge, suffix)

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, eq, vmfeq, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ne, vmfne, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, lt, vmflt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, vmfgt, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, vmfle, suffix) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, vmfge, suffix)


OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64, u64)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64)
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64)
#endif

inline v_float32 v_not_nan(const v_float32& a)
{ return v_eq(a, a); }

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_not_nan(const v_float64& a)
{ return v_eq(a, a); }
#endif
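
// Usage sketch (illustrative addition): v_not_nan yields an all-ones mask on
// non-NaN lanes (NaN != NaN), so NaNs can be zeroed with a bitwise and:
//
//   v_float32 cleaned = v_and(v_not_nan(x), x);  // NaN lanes become 0.0f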


#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return intrin(a, b, vl); \
}

OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, vminu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, vmaxu, VTraits<v_uint8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, vmin, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, vmax, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, vminu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, vmaxu, VTraits<v_uint16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, vmin, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, vmax, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, vminu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, vmaxu, VTraits<v_uint32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits<v_float32>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits<v_float64>::vlanes())
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits<v_float64>::vlanes())
#endif

#define OPENCV_HAL_IMPL_RVV_ZIP4(_Tpvec, _wTpvec, suffix, convert2u, convert) \
inline void v_zip4(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
    int vl = 4; \
    _wTpvec temp = vreinterpret_##suffix##m2(convert2u( \
        vor(vzext_vf2(convert(a0), vl), \
            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(convert(a1), vl)), 0, vl*2)), \
            vl))); \
    b0 = vget_##suffix##m1(temp, 0); \
    b1 = vget_##suffix##m1(vrgather(temp, vadd(vid_v_u32m2(vl), 4, vl) /*{4,5,6,7}*/, vl), 0); \
}

OPENCV_HAL_IMPL_RVV_ZIP4(v_uint32, vuint32m2_t, u32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_RVV_ZIP4(v_int32, vint32m2_t, i32, vreinterpret_u32m2, vreinterpret_u32m1)
OPENCV_HAL_IMPL_RVV_ZIP4(v_float32, vfloat32m2_t, f32, vreinterpret_u32m2, vreinterpret_u32m1)

#if 0
// This is v_zip4 and v_transpose4x4 for scalable VLEN; it costs more instructions than the current 128-bit-only version.
inline void v_zip4(const v_float32& a0, const v_float32& a1, v_float32& b0, v_float32& b1) {
    vuint64m1_t vid1 = vid_v_u64m1(VTraits<vuint64m1_t>::vlanes());
    vuint16m1_t t1 = vreinterpret_u16m1(vid1);
    vuint16m1_t t2 = vslide1up(t1, 0, VTraits<vuint16m1_t>::vlanes());
    vuint16m1_t t3 = vslide1up(t2, 0, VTraits<vuint16m1_t>::vlanes());
    vuint16m1_t t4 = vslide1up(t3, 0, VTraits<vuint16m1_t>::vlanes());
    t1 = vor(
        vor(t1, t2, VTraits<vuint16m1_t>::vlanes()),
        vor(t3, t4, VTraits<vuint16m1_t>::vlanes()),
        VTraits<vuint16m1_t>::vlanes()
    );
    vuint32m2_t vidx0 = vwmulu(t1, 4, VTraits<vuint32m1_t>::vlanes());
    vidx0 = vadd(vidx0, vid_v_u32m2(VTraits<vuint32m1_t>::vlanes()), VTraits<vuint32m1_t>::vlanes());
    vuint32m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint32m1_t>::vlanes());
    vfloat32m2_t temp = vreinterpret_f32m2(vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(a0), VTraits<vuint16m1_t>::vlanes()),
            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a1), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vfloat32m1_t>::vlanes()*2)),
            VTraits<vfloat32m1_t>::vlanes())));
    b0 = vlmul_trunc_f32m1(vrgather(temp, vidx0, VTraits<vuint16m1_t>::vlanes()));
    b1 = vlmul_trunc_f32m1(vrgather(temp, vidx1, VTraits<vuint16m1_t>::vlanes()));
}

inline void v_transpose4x4(const v_float32& a0, const v_float32& a1, const v_float32& a2, const v_float32& a3,
                           v_float32& b0, v_float32& b1, v_float32& b2, v_float32& b3) {
    vuint64m2_t vid1 = vid_v_u64m2(VTraits<vuint32m1_t>::vlanes());
    vuint16m2_t t1 = vreinterpret_u16m2(vid1);
    vuint16m2_t t2 = vslide1up(t1, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t3 = vslide1up(t2, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t4 = vslide1up(t3, 0, VTraits<vuint8m1_t>::vlanes());
    t1 = vor(
        vor(t1, t2, VTraits<vuint8m1_t>::vlanes()),
        vor(t3, t4, VTraits<vuint8m1_t>::vlanes()),
        VTraits<vuint8m1_t>::vlanes()
    );
    vuint16m2_t vidx0 = vmul(t1, 12, VTraits<vuint8m1_t>::vlanes());
    vidx0 = vadd(vidx0, vid_v_u16m2(VTraits<vuint8m1_t>::vlanes()), VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx2 = vadd(vidx0, 8, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx3 = vadd(vidx0, 12, VTraits<vuint8m1_t>::vlanes());
    vuint32m2_t tempA = vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(a0), VTraits<vuint16m1_t>::vlanes()),
            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a2), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
            VTraits<vuint32m1_t>::vlanes()));
    vuint32m2_t tempB = vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(a1), VTraits<vuint16m1_t>::vlanes()),
            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(a3), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
            VTraits<vuint32m1_t>::vlanes()));
    vfloat32m4_t temp = vreinterpret_f32m4(vreinterpret_u32m4(
        vor(vzext_vf2(tempA, VTraits<vuint8m1_t>::vlanes()),
            vreinterpret_u64m4(vslide1up(vreinterpret_u32m4(vzext_vf2(tempB, VTraits<vuint8m1_t>::vlanes())), 0, VTraits<vuint8m1_t>::vlanes())),
            VTraits<vuint16m1_t>::vlanes())));
    b0 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx0, VTraits<vuint8m1_t>::vlanes()));
    b1 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx1, VTraits<vuint8m1_t>::vlanes()));
    b2 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx2, VTraits<vuint8m1_t>::vlanes()));
    b3 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx3, VTraits<vuint8m1_t>::vlanes()));
}
#endif

#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, suffix) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, const _Tpvec& a2, const _Tpvec& a3, _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) { \
    _Tpvec t0, t1, t2, t3; \
    v_zip4(a0, a2, t0, t2); \
    v_zip4(a1, a3, t1, t3); \
    v_zip4(t0, t1, b0, b1); \
    v_zip4(t2, t3, b2, b3); \
}

OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_uint32, u32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_int32, i32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_float32, f32)


#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl, red) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
    _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \
    res = v##red(res, a, zero, vl); \
    return (scalartype)v_get0(res); \
}
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8, v_uint16, vuint16m1_t, unsigned, u16, VTraits<v_uint8>::vlanes(), wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8, v_int16, vint16m1_t, int, i16, VTraits<v_int8>::vlanes(), wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16, v_uint32, vuint32m1_t, unsigned, u32, VTraits<v_uint16>::vlanes(), wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16, v_int32, vint32m1_t, int, i32, VTraits<v_int16>::vlanes(), wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32, v_uint64, vuint64m1_t, unsigned, u64, VTraits<v_uint32>::vlanes(), wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32, v_int64, vint64m1_t, int, i64, VTraits<v_int32>::vlanes(), wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64, v_uint64, vuint64m1_t, uint64, u64, VTraits<v_uint64>::vlanes(), redsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64, v_int64, vint64m1_t, int64, i64, VTraits<v_int64>::vlanes(), redsum)


#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
    _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \
    res = vfredosum(res, a, zero, vl); \
    return (scalartype)v_get0(res); \
}
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, double, f64, VTraits<v_float64>::vlanes())
#endif

#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpvec res = _Tpvec(v##red(a, a, a, vl)); \
    return (scalartype)v_get0(res); \
}

OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, min, uchar, u8, VTraits<v_uint8>::vlanes(), redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, min, schar, i8, VTraits<v_int8>::vlanes(), redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, min, ushort, u16, VTraits<v_uint16>::vlanes(), redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, min, short, i16, VTraits<v_int16>::vlanes(), redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, min, unsigned, u32, VTraits<v_uint32>::vlanes(), redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, min, int, i32, VTraits<v_int32>::vlanes(), redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, min, float, f32, VTraits<v_float32>::vlanes(), fredmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, max, uchar, u8, VTraits<v_uint8>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, max, schar, i8, VTraits<v_int8>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, max, ushort, u16, VTraits<v_uint16>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits<v_int16>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits<v_uint32>::vlanes(), redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits<v_int32>::vlanes(), redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits<v_float32>::vlanes(), fredmax)
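
// Usage sketch (illustrative addition): the horizontal reductions collapse a
// whole register into one scalar, e.g. finding the extrema of a loaded block:
//
//   v_float32 v = v_load(ptr);
//   float lo = v_reduce_min(v), hi = v_reduce_max(v);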

inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b,
                               const v_float32& c, const v_float32& d)
{
    // 0000 1111 2222 3333 ....
    vuint64m2_t vid1 = vid_v_u64m2(VTraits<vuint32m1_t>::vlanes());
    vuint16m2_t t1 = vreinterpret_u16m2(vid1);
    vuint16m2_t t2 = vslide1up(t1, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t3 = vslide1up(t2, 0, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t t4 = vslide1up(t3, 0, VTraits<vuint8m1_t>::vlanes());
    t1 = vor(
        vor(t1, t2, VTraits<vuint8m1_t>::vlanes()),
        vor(t3, t4, VTraits<vuint8m1_t>::vlanes()),
        VTraits<vuint8m1_t>::vlanes()
    );

    // index for transpose4X4
    vuint16m2_t vidx0 = vmul(t1, 12, VTraits<vuint8m1_t>::vlanes());
    vidx0 = vadd(vidx0, vid_v_u16m2(VTraits<vuint8m1_t>::vlanes()), VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx1 = vadd(vidx0, 4, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx2 = vadd(vidx0, 8, VTraits<vuint8m1_t>::vlanes());
    vuint16m2_t vidx3 = vadd(vidx0, 12, VTraits<vuint8m1_t>::vlanes());

    // zip
    vuint32m2_t tempA = vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(a), VTraits<vuint16m1_t>::vlanes()),
            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(c), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
            VTraits<vuint32m1_t>::vlanes()));
    vuint32m2_t tempB = vreinterpret_u32m2(
        vor(vzext_vf2(vreinterpret_u32m1(b), VTraits<vuint16m1_t>::vlanes()),
            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(vreinterpret_u32m1(d), VTraits<vuint16m1_t>::vlanes())), 0, VTraits<vuint16m1_t>::vlanes())),
            VTraits<vuint32m1_t>::vlanes()));
    vfloat32m4_t temp = vreinterpret_f32m4(vreinterpret_u32m4(
        vor(vzext_vf2(tempA, VTraits<vuint8m1_t>::vlanes()),
            vreinterpret_u64m4(vslide1up(vreinterpret_u32m4(vzext_vf2(tempB, VTraits<vuint8m1_t>::vlanes())), 0, VTraits<vuint8m1_t>::vlanes())),
            VTraits<vuint16m1_t>::vlanes())));

    // transpose
    vfloat32m1_t b0 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx0, VTraits<vuint8m1_t>::vlanes()));
    vfloat32m1_t b1 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx1, VTraits<vuint8m1_t>::vlanes()));
    vfloat32m1_t b2 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx2, VTraits<vuint8m1_t>::vlanes()));
    vfloat32m1_t b3 = vlmul_trunc_f32m1(vrgatherei16(temp, vidx3, VTraits<vuint8m1_t>::vlanes()));

    // vector add
    v_float32 res = vfadd(
        vfadd(b0, b1, VTraits<vfloat32m1_t>::vlanes()),
        vfadd(b2, b3, VTraits<vfloat32m1_t>::vlanes()),
        VTraits<vfloat32m1_t>::vlanes()
    );
    return res;
}


inline v_float32 v_sqrt(const v_float32& x)
{
    return vfsqrt(x, VTraits<v_float32>::vlanes());
}

inline v_float32 v_invsqrt(const v_float32& x)
{
    v_float32 one = v_setall_f32(1.0f);
    return v_div(one, v_sqrt(x));
}

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_sqrt(const v_float64& x)
{
    return vfsqrt(x, VTraits<v_float64>::vlanes());
}

inline v_float64 v_invsqrt(const v_float64& x)
{
    v_float64 one = v_setall_f64(1.0);
    return v_div(one, v_sqrt(x));
}
#endif

inline v_float32 v_magnitude(const v_float32& a, const v_float32& b)
{
    v_float32 x = vfmacc(vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes());
    return v_sqrt(x);
}

inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b)
{
    return v_float32(vfmacc(vfmul(a, a, VTraits<v_float32>::vlanes()), b, b, VTraits<v_float32>::vlanes()));
}

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_magnitude(const v_float64& a, const v_float64& b)
{
    v_float64 x = vfmacc(vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
    return v_sqrt(x);
}

inline v_float64 v_sqr_magnitude(const v_float64& a, const v_float64& b)
{
    return vfmacc(vfmul(a, a, VTraits<v_float64>::vlanes()), b, b, VTraits<v_float64>::vlanes());
}
#endif


inline v_float32 v_fma(const v_float32& a, const v_float32& b, const v_float32& c)
{
    return vfmacc(c, a, b, VTraits<v_float32>::vlanes());
}
inline v_int32 v_fma(const v_int32& a, const v_int32& b, const v_int32& c)
{
    return vmacc(c, a, b, VTraits<v_int32>::vlanes());
}

inline v_float32 v_muladd(const v_float32& a, const v_float32& b, const v_float32& c)
{
    return v_fma(a, b, c);
}

inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c)
{
    return v_fma(a, b, c);
}

#if CV_SIMD_SCALABLE_64F
inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c)
{
    return vfmacc_vv_f64m1(c, a, b, VTraits<v_float64>::vlanes());
}

inline v_float64 v_muladd(const v_float64& a, const v_float64& b, const v_float64& c)
{
    return v_fma(a, b, c);
}
#endif
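
// Usage sketch (illustrative addition): v_fma/v_muladd map onto a single
// vfmacc, the building block of dot products and convolutions:
//
//   acc = v_fma(v_load(a + i), v_load(b + i), acc);  // acc += a[i..] * b[i..]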


#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    return (int)vcpop(vmslt(a, 0, vl), vl) == vl; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    return (int)vcpop(vmslt(a, 0, vl), vl) != 0; \
}

OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8, VTraits<v_int8>::vlanes())
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16, VTraits<v_int16>::vlanes())
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32, VTraits<v_int32>::vlanes())
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64, VTraits<v_int64>::vlanes())


inline bool v_check_all(const v_uint8& a)
{ return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint8& a)
{ return v_check_any(v_reinterpret_as_s8(a)); }

inline bool v_check_all(const v_uint16& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint16& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }

inline bool v_check_all(const v_uint32& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_uint32& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }

inline bool v_check_all(const v_float32& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float32& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }

inline bool v_check_all(const v_uint64& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
inline bool v_check_any(const v_uint64& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }

#if CV_SIMD_SCALABLE_64F
inline bool v_check_all(const v_float64& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
inline bool v_check_any(const v_float64& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }
#endif


#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_sub(v_max(a, b), v_min(a, b)); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff)
#endif
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8, absdiffs)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16, absdiffs)
#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, width) \
inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    return vnclipu(vreinterpret_u##width##m2(vwsub_vv(v_max(a, b), v_min(a, b), VTraits<_Tpvec>::vlanes())), 0, VTraits<_Tpvec>::vlanes()); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8, v_uint8, 16)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16, v_uint16, 32)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32, v_uint32, 64)

#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
inline _Tprvec v_abs(const _Tpvec& a) \
{ \
    return v_absdiff(a, v_setzero_##suffix()); \
}

OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8)
OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16)
OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32)
OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32)
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64)
#endif
1232
1233
1234#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
1235inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
1236{ \
1237 return v_reduce_sum(v_absdiff(a, b)); \
1238}
1239
1240OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8, unsigned)
1241OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8, unsigned)
1242OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16, unsigned)
1243OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16, unsigned)
1244OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32, unsigned)
1245OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32, unsigned)
1246OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32, float)
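// v_reduce_sad simply composes the two primitives above, e.g. (sketch):
//
//     unsigned sad = v_reduce_sad(v_load(p), v_load(q)); // sum of |p[i] - q[i]|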
1247
1248
1249
1250#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, vl) \
1251inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1252{ \
1253 return vmerge(vmsne(mask, 0, vl), b, a, vl); \
1254}
1255
1256OPENCV_HAL_IMPL_RVV_SELECT(v_uint8, VTraits<v_uint8>::vlanes())
1257OPENCV_HAL_IMPL_RVV_SELECT(v_uint16, VTraits<v_uint16>::vlanes())
1258OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits<v_uint32>::vlanes())
1259OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits<v_int8>::vlanes())
1260OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits<v_int16>::vlanes())
1261OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits<v_int32>::vlanes())
1262
1263inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b)
1264{
1265 return vmerge(vmfne(mask, 0, VTraits<v_float32>::vlanes()), b, a, VTraits<v_float32>::vlanes());
1266}
1267
1268#if CV_SIMD_SCALABLE_64F
1269inline v_float64 v_select(const v_float64& mask, const v_float64& a, const v_float64& b)
1270{
1271 return vmerge(vmfne(mask, 0, VTraits<v_float64>::vlanes()), b, a, VTraits<v_float64>::vlanes());
1272}
1273#endif
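// Usage sketch (illustrative): v_select() blends lane-wise, taking a lane
// from `a` where the mask lane is non-zero and from `b` otherwise, e.g.
// clamping negative lanes of a hypothetical input `x` to zero:
//
//     v_float32 mask = v_lt(x, v_setzero_f32());
//     v_float32 y = v_select(mask, v_setzero_f32(), x);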
1274
1276
1277#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
1278template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1279{ \
1280 return vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl); \
1281} \
1282template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1283{ \
1284 return vslideup(vmv_v_x_##suffix##m1(0, vl), a, n, vl); \
1285} \
1286template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1287{ return a; } \
1288template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1289{ \
1290 return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \
1291} \
1292template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1293{ \
1294 return vslideup(vslidedown(vmv_v_x_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \
1295} \
1296template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1297{ CV_UNUSED(b); return a; }
1298
1299OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8, u8, VTraits<v_uint8>::vlanes())
1300OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8, i8, VTraits<v_int8>::vlanes())
1301OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16, u16, VTraits<v_uint16>::vlanes())
1302OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16, i16, VTraits<v_int16>::vlanes())
1303OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32, u32, VTraits<v_uint32>::vlanes())
1304OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32, i32, VTraits<v_int32>::vlanes())
1305OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64, u64, VTraits<v_uint64>::vlanes())
1306OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64, i64, VTraits<v_int64>::vlanes())
1307
1308#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
1309template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1310{ \
1311 return vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); \
1312} \
1313template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1314{ \
1315 return vslideup(vfmv_v_f_##suffix##m1(0, vl), a, n, vl); \
1316} \
1317template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1318{ return a; } \
1319template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1320{ \
1321 return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \
1322} \
1323template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1324{ \
1325 return vslideup(vslidedown(vfmv_v_f_##suffix##m1(0, vl), b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \
1326} \
1327template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1328{ CV_UNUSED(b); return a; }
1329
1330OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits<v_float32>::vlanes())
1331#if CV_SIMD_SCALABLE_64F
1332OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64, VTraits<v_float64>::vlanes())
1333#endif
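// Lane model of the rotates (sketch), for a 4-lane vector a = {a0 a1 a2 a3}
// and b = {b0 b1 b2 b3}:
//
//     v_rotate_right<1>(a)    -> {a1 a2 a3  0}  // vslidedown over zeros
//     v_rotate_left<1>(a)     -> { 0 a0 a1 a2}  // vslideup over zeros
//     v_rotate_right<1>(a, b) -> {a1 a2 a3 b0}  // shifts lanes in from b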
1334
1336inline v_float32 v_cvt_f32(const v_int32& a)
1337{
1338 return vfcvt_f_x_v_f32m1(a, VTraits<v_float32>::vlanes());
1339}
1340
1341#if CV_SIMD_SCALABLE_64F
1342inline v_float32 v_cvt_f32(const v_float64& a)
1343{
1344 return vfncvt_f(vlmul_ext_f64m2(a), VTraits<v_float64>::vlanes());
1345}
1346
1347inline v_float32 v_cvt_f32(const v_float64& a, const v_float64& b)
1348{
1349 return vfncvt_f(vset(vlmul_ext_f64m2(a),1,b), VTraits<v_float32>::vlanes());
1350}
1351
1352inline v_float64 v_cvt_f64(const v_int32& a)
1353{
1354 return vget_f64m1(vfwcvt_f(a, VTraits<v_int32>::vlanes()), 0);
1355}
1356
1357inline v_float64 v_cvt_f64_high(const v_int32& a)
1358{
1359 return vget_f64m1(vfwcvt_f(a, VTraits<v_int32>::vlanes()), 1);
1360}
1361
1362inline v_float64 v_cvt_f64(const v_float32& a)
1363{
1364 return vget_f64m1(vfwcvt_f(a, VTraits<v_float32>::vlanes()), 0);
1365}
1366
1367inline v_float64 v_cvt_f64_high(const v_float32& a)
1368{
1369 return vget_f64m1(vfwcvt_f(a, VTraits<v_float32>::vlanes()), 1);
1370}
1371
1372inline v_float64 v_cvt_f64(const v_int64& a)
1373{
1374 return vfcvt_f(a, VTraits<v_int64>::vlanes());
1375}
1376#endif
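// Width model (sketch): vfwcvt_f on an m1 source produces an m2 result with
// the same lane count, and vget_f64m1(..., 0/1) selects a half, so for
// a = {i0 i1 i2 i3} (int32):
//
//     v_cvt_f64(a)      -> {(double)i0, (double)i1}   // lower half
//     v_cvt_f64_high(a) -> {(double)i2, (double)i3}   // upper half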
1377
1379
1380#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
1381template<int s = 0> inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) \
1382{ \
1383 return v_setall_##suffix(v_extract_n(v, i)); \
1384} \
1385inline _Tpvec v_broadcast_highest(_Tpvec v) \
1386{ \
1387 return v_setall_##suffix(v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \
1388}
1389
1390OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32)
1391OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32)
1392OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32)
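// Example (sketch): both helpers reduce to extract-then-splat, e.g. for
// v = {p0 p1 p2 p3 ...}:
//
//     v_broadcast_element<2>(v) -> {p2 p2 p2 p2 ...}
//     v_broadcast_highest(v)    -> {pn pn pn pn ...}  // pn = last lane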
1393
1394
1395
1396#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, width) \
1397inline _Tpvec v_reverse(const _Tpvec& a) \
1398{ \
1399 vuint##width##m1_t vidx = vrsub(vid_v_u##width##m1(VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()-1, VTraits<_Tpvec>::vlanes()); \
1400 return vrgather(a, vidx, VTraits<_Tpvec>::vlanes()); \
1401}
1402OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8)
1403OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8)
1404OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16)
1405OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16)
1406OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32)
1407OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32)
1408OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32)
1409OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64, 64)
1410OPENCV_HAL_IMPL_RVV_REVERSE(v_int64, 64)
1411#if CV_SIMD_SCALABLE_64F
1412OPENCV_HAL_IMPL_RVV_REVERSE(v_float64, 64)
1413#endif
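// The reversal is a gather through a descending index vector (sketch):
// vid_v gives {0, 1, ..., n-1}, vrsub maps i to (n-1) - i, and vrgather
// permutes, so {a0 a1 a2 a3} becomes {a3 a2 a1 a0}.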
1414
1416
1417#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \
1418inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1419{ \
1420 _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
1421 b0 = vget_##suffix##m1(temp, 0); \
1422 b1 = vget_##suffix##m1(temp, 1); \
1423} \
1424inline _Tpwvec v_expand_low(const _Tpvec& a) \
1425{ \
1426 _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
1427 return vget_##suffix##m1(temp, 0); \
1428} \
1429inline _Tpwvec v_expand_high(const _Tpvec& a) \
1430{ \
1431 _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
1432 return vget_##suffix##m1(temp, 1); \
1433} \
1434inline _Tpwvec v_load_expand(const _Tp* ptr) \
1435{ \
1436 return cvt(vle##width##_v_##suffix2##mf2(ptr, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
1437}
1438
1439OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x)
1440OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, vwcvt_x)
1441OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, vwcvtu_x)
1442OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, vwcvt_x)
1443OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, vwcvtu_x)
1444OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, vwcvt_x)
1445
1446inline v_uint32 v_load_expand_q(const uchar* ptr)
1447{
1448 return vwcvtu_x(vwcvtu_x(vle8_v_u8mf4(ptr, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
1449}
1450
1451inline v_int32 v_load_expand_q(const schar* ptr)
1452{
1453 return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
1454}
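// v_load_expand_q widens 4x in two steps (sketch): the 8-bit lanes are
// loaded into an mf4 register, then vwcvtu_x/vwcvt_x doubles the element
// width twice (8 -> 16 -> 32), e.g. for uchar p[] = {1, 2, 3, 4, ...}:
//
//     v_uint32 v = v_load_expand_q(p); // {1u, 2u, 3u, 4u, ...}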
1455
1456#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, hwidth, hsuffix, suffix, rshr, shr) \
1457inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
1458{ \
1459 return shr(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, VTraits<_Tpvec>::vlanes()); \
1460} \
1461inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
1462{ \
1463 vse##hwidth##_v_##hsuffix##mf2(ptr, shr(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \
1464} \
1465template<int n = 0> inline \
1466_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b, int N = n) \
1467{ \
1468 return rshr(vset(vlmul_ext_##suffix##m2(a), 1, b), N, VTraits<_Tpvec>::vlanes()); \
1469} \
1470template<int n = 0> inline \
1471void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a, int N = n) \
1472{ \
1473 vse##hwidth##_v_##hsuffix##mf2(ptr, rshr(a, N, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \
1474}
1475
1476OPENCV_HAL_IMPL_RVV_PACK(v_uint8, uchar, v_uint16, 8, u8, u16, vnclipu, vnclipu)
1477OPENCV_HAL_IMPL_RVV_PACK(v_int8, schar, v_int16, 8, i8, i16, vnclip, vnclip)
1478OPENCV_HAL_IMPL_RVV_PACK(v_uint16, ushort, v_uint32, 16, u16, u32, vnclipu, vnclipu)
1479OPENCV_HAL_IMPL_RVV_PACK(v_int16, short, v_int32, 16, i16, i32, vnclip, vnclip)
1480OPENCV_HAL_IMPL_RVV_PACK(v_uint32, unsigned, v_uint64, 32, u32, u64, vnclipu, vnsrl)
1481OPENCV_HAL_IMPL_RVV_PACK(v_int32, int, v_int64, 32, i32, i64, vnclip, vnsra)
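// Packing model (sketch): v_pack places `a` in the lower half and `b` in the
// upper half of an m2 register (vlmul_ext + vset), then narrows with a
// saturating shift of 0; v_rshr_pack<n> uses shift n instead (rounding
// follows the fixed-point rounding-mode CSR), e.g.:
//
//     v_int8 r = v_pack(a16, b16);         // saturated lanes of a16, then b16
//     v_int8 s = v_rshr_pack<2>(a16, b16); // saturate(x >> 2) per lane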
1482
1483#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
1484inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
1485{ \
1486 return vnclipu(cast(vmax(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), 0, vl); \
1487} \
1488inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
1489{ \
1490 vse##hwidth##_v_##hsuffix##mf2(ptr, vnclipu(vreinterpret_u##width##m1(vmax(a, 0, vl)), 0, vl), hvl); \
1491} \
1492template<int N = 0> inline \
1493_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b, int n = N) \
1494{ \
1495 return vnclipu(cast(vmax(vset(vlmul_ext_##suffix##m2(a), 1, b), 0, vl)), n, vl); \
1496} \
1497template<int N = 0> inline \
1498void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \
1499{ \
1500 vse##hwidth##_v_##hsuffix##mf2(ptr, vnclipu(vreinterpret_u##width##m1(vmax(a, 0, vl)), n, vl), hvl); \
1501}
1502
1503OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, VTraits<v_int16>::vlanes(), VTraits<v_uint8>::vlanes())
1504OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, VTraits<v_int32>::vlanes(), VTraits<v_uint16>::vlanes())
1505
1506
1507/* void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
1508 a0 = {A1 A2 A3 A4}
1509 a1 = {B1 B2 B3 B4}
1510---------------
1511 {A1 B1 A2 B2} and {A3 B3 A4 B4}
1512*/
1513
1514#define OPENCV_HAL_IMPL_RVV_ZIP(_Tpvec, _wTpvec, suffix, width, width2, convert2um2, convert2um1) \
1515inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
1516 _wTpvec temp = vreinterpret_##suffix##m2(convert2um2( \
1517 vor(vzext_vf2(convert2um1(a0), VTraits<_Tpvec>::vlanes()*2), \
1518 vreinterpret_u##width2##m2(vslide1up(vreinterpret_u##width##m2(vzext_vf2(convert2um1(a1), VTraits<_Tpvec>::vlanes()*2)), 0, VTraits<_Tpvec>::vlanes()*2)), \
1519 VTraits<_Tpvec>::vlanes()))); \
1520 b0 = vget_##suffix##m1(temp, 0); \
1521 b1 = vget_##suffix##m1(temp, 1); \
1522}
1523OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1524OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, vreinterpret_u8m2, vreinterpret_u8m1)
1525OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1526OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, vreinterpret_u16m2, vreinterpret_u16m1)
1527OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1528OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
1529OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
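// The zip trick above (sketch): zero-extending a0 to double width yields, in
// the narrow view, {A1 0 A2 0 ...}; doing the same to a1 and sliding up by
// one narrow lane yields {0 B1 0 B2 ...}; OR-ing the two gives the
// interleaved {A1 B1 A2 B2 ...}, which vget then splits into b0 and b1.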
1530
1531#if CV_SIMD_SCALABLE_64F
1532inline void v_zip(const v_float64& a0, const v_float64& a1, v_float64& b0, v_float64& b1) {
1533 vuint16mf4_t idx0 = vid_v_u16mf4(VTraits<v_float64>::vlanes());
1534 vuint16mf4_t idx1 = vadd(idx0, VTraits<v_float64>::vlanes(), VTraits<v_float64>::vlanes());
1535 vuint16mf2_t idx = vreinterpret_u16mf2((
1536 vor(vzext_vf2(idx0, VTraits<v_float64>::vlanes()),
1537 vreinterpret_u32mf2(vslide1up(vreinterpret_u16mf2(vzext_vf2(idx1, VTraits<v_float64>::vlanes())), 0, VTraits<v_uint32>::vlanes())),
1538 VTraits<v_uint32>::vlanes())));
1539#if 0
1540 vfloat64m2_t temp = __riscv_vcreate_v_f64m1_f64m2(a0, a1);
1541#else // TODO: clean up when the RVV intrinsics are frozen.
1542 vfloat64m2_t temp = vlmul_ext_f64m2(a0);
1543 temp = vset(temp, 1, a1);
1544#endif
1545 temp = vrgatherei16(temp, idx, VTraits<v_float64>::vlanes()*2);
1546 b0 = vget_f64m1(temp, 0);
1547 b1 = vget_f64m1(temp, 1);
1548}
1549#endif
1550
1551#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \
1552inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1553{ \
1554 return vslideup(a, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes());\
1555} \
1556inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1557{ \
1558 return vslideup( \
1559 vslidedown(a, a, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
1560 vslidedown(b, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \
1561 VTraits<_Tpvec>::vlanes()/2, \
1562 VTraits<_Tpvec>::vlanes()); \
1563} \
1564inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1565{ \
1566 c = v_combine_low(a, b); \
1567 d = v_combine_high(a, b); \
1568}
1569
1570OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint8, 8)
1571OPENCV_HAL_IMPL_RVV_UNPACKS(v_int8, 8)
1572OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16)
1573OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16)
1574OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32)
1575OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32)
1576OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32)
1577#if CV_SIMD_SCALABLE_64F
1578OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64)
1579#endif
1580
1581#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width, hwidth, vl) \
1582inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
1583{ \
1584 a = vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
1585 b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*2, VTraits<v_##_Tpvec>::vlanes()); \
1586}\
1587inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
1588{ \
1589 a = vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
1590 b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
1591 c = vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*3, VTraits<v_##_Tpvec>::vlanes()); \
1592} \
1593inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
1594 v_##_Tpvec& c, v_##_Tpvec& d) \
1595{ \
1596 \
1597 a = vlse##width##_v_##suffix##m1(ptr , sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
1598 b = vlse##width##_v_##suffix##m1(ptr+1, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
1599 c = vlse##width##_v_##suffix##m1(ptr+2, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
1600 d = vlse##width##_v_##suffix##m1(ptr+3, sizeof(_Tp)*4, VTraits<v_##_Tpvec>::vlanes()); \
1601} \
1602inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1603 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1604{ \
1605 vsse##width(ptr, sizeof(_Tp)*2, a, VTraits<v_##_Tpvec>::vlanes()); \
1606 vsse##width(ptr+1, sizeof(_Tp)*2, b, VTraits<v_##_Tpvec>::vlanes()); \
1607} \
1608inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1609 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
1610{ \
1611 vsse##width(ptr, sizeof(_Tp)*3, a, VTraits<v_##_Tpvec>::vlanes()); \
1612 vsse##width(ptr+1, sizeof(_Tp)*3, b, VTraits<v_##_Tpvec>::vlanes()); \
1613 vsse##width(ptr+2, sizeof(_Tp)*3, c, VTraits<v_##_Tpvec>::vlanes()); \
1614} \
1615inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
1616 const v_##_Tpvec& c, const v_##_Tpvec& d, \
1617 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
1618{ \
1619 vsse##width(ptr, sizeof(_Tp)*4, a, VTraits<v_##_Tpvec>::vlanes()); \
1620 vsse##width(ptr+1, sizeof(_Tp)*4, b, VTraits<v_##_Tpvec>::vlanes()); \
1621 vsse##width(ptr+2, sizeof(_Tp)*4, c, VTraits<v_##_Tpvec>::vlanes()); \
1622 vsse##width(ptr+3, sizeof(_Tp)*4, d, VTraits<v_##_Tpvec>::vlanes()); \
1623}
1624
1625OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, u8, 8, 4, VTraits<v_uint8>::vlanes())
1626OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, i8, 8, 4, VTraits<v_int8>::vlanes())
1627OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, u16, 16, 8, VTraits<v_uint16>::vlanes())
1628OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, i16, 16, 8, VTraits<v_int16>::vlanes())
1629OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, u32, 32, 16, VTraits<v_uint32>::vlanes())
1630OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, i32, 32, 16, VTraits<v_int32>::vlanes())
1631OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, f32, 32, 16, VTraits<v_float32>::vlanes())
1632OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, u64, 64, 32, VTraits<v_uint64>::vlanes())
1633OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, i64, 64, 32, VTraits<v_int64>::vlanes())
1634#if CV_SIMD_SCALABLE_64F
1635OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, f64, 64, 32, VTraits<v_float64>::vlanes())
1636#endif
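// Usage sketch: the deinterleave/interleave pairs convert between packed
// (AoS) and planar (SoA) layouts with strided accesses, e.g. for BGR bytes:
//
//     v_uint8 b, g, r;
//     v_load_deinterleave(bgr, b, g, r); // b = {bgr[0], bgr[3], ...}, etc.
//     v_store_interleave(bgr, b, g, r);  // inverse operation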
1637
1638static uint64_t idx_interleave_pairs[] = {
1639 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x1715161413111210, 0x1f1d1e1c1b191a18,
1640 0x2725262423212220, 0x2f2d2e2c2b292a28, 0x3735363433313230, 0x3f3d3e3c3b393a38,
1641 0x4745464443414240, 0x4f4d4e4c4b494a48, 0x5755565453515250, 0x5f5d5e5c5b595a58,
1642 0x6765666463616260, 0x6f6d6e6c6b696a68, 0x7775767473717270, 0x7f7d7e7c7b797a78};
1643
1644static uint64_t idx_interleave_quads[] = {
1645 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x1713161215111410, 0x1f1b1e1a1d191c18,
1646 0x2723262225212420, 0x2f2b2e2a2d292c28, 0x3733363235313430, 0x3f3b3e3a3d393c38,
1647 0x4743464245414440, 0x4f4b4e4a4d494c48, 0x5753565255515450, 0x5f5b5e5a5d595c58,
1648 0x6763666265616460, 0x6f6b6e6a6d696c68, 0x7773767275717470, 0x7f7b7e7a7d797c78};
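// Each uint64 above packs eight little-endian byte indices, so the arrays
// describe byte permutations for registers of up to 128 bytes: the first
// pairs entry 0x0705060403010200 reads as {0, 2, 1, 3, 4, 6, 5, 7}, i.e.
// swap the two middle bytes of every 4-byte group, which interleaves pairs.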
1649
1650#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(_Tpvec, func) \
1651inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
1652 CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
1653 vuint8m1_t vidx = vundefined_u8m1();\
1654 vidx = vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)); \
1655 return vrgather(vec, vidx, VTraits<v_uint8>::vlanes()); \
1656}
1657OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, pairs)
1658OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, pairs)
1659OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, quads)
1660OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, quads)
1661
1662#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(_Tpvec, width, vzext_vfx, func) \
1663inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
1664 CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
1665 vuint##width##m1_t vidx = vundefined_u##width##m1();\
1666 vidx = vget_u##width##m1(vzext_vfx(vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)), VTraits<v_uint8>::vlanes()), 0); \
1667 return vrgather(vec, vidx, VTraits<_Tpvec>::vlanes()); \
1668}
1669
1670OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, pairs)
1671OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, pairs)
1672OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, pairs)
1673OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, pairs)
1674OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, pairs)
1675
1676OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, quads)
1677OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, quads)
1678OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, quads)
1679OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, quads)
1680OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, quads)
1681
1682
1683static const unsigned char popCountTable[256] =
1684{
1685 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1686 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1687 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1688 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1689 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1690 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1691 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1692 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1693 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1694 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1695 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1696 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1697 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1698 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1699 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1700 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
1701};
1702#define OPENCV_HAL_IMPL_RVV_HADD(_Tpvec, _Tpvec2, _Tm2, width, width2, suffix, add) \
1703static inline _Tpvec2 v_hadd(_Tpvec a) { \
1704 vuint##width2##m1_t oneX2 = vmv_v_x_u##width2##m1(1, VTraits<v_uint##width2>::vlanes()); \
1705 vuint##width##m1_t one = vreinterpret_u##width##m1(oneX2); \
1706 _Tm2 res = add(a, vslide1down(a, 0, VTraits<v_uint##width>::vlanes()), VTraits<v_uint##width>::vlanes()); \
1707 return vget_##suffix##m1(vcompress(vmseq(one, 1, VTraits<v_uint##width>::vlanes()), res, res, VTraits<v_uint##width>::vlanes()), 0); \
1708}
1709OPENCV_HAL_IMPL_RVV_HADD(v_uint8, v_uint16, vuint16m2_t, 8, 16, u16, vwaddu_vv)
1710OPENCV_HAL_IMPL_RVV_HADD(v_uint16, v_uint32, vuint32m2_t, 16, 32, u32, vwaddu_vv)
1711OPENCV_HAL_IMPL_RVV_HADD(v_uint32, v_uint64, vuint64m2_t, 32, 64, u64, vwaddu_vv)
1712OPENCV_HAL_IMPL_RVV_HADD(v_int8, v_int16, vint16m2_t, 8, 16, i16, vwadd_vv)
1713OPENCV_HAL_IMPL_RVV_HADD(v_int16, v_int32, vint32m2_t, 16, 32, i32, vwadd_vv)
1714OPENCV_HAL_IMPL_RVV_HADD(v_int32, v_int64, vint64m2_t, 32, 64, i64, vwadd_vv)
1715
1716OPENCV_HAL_IMPL_RVV_HADD(vint32m2_t, v_int32, vint32m2_t, 16, 32, i32, vadd)
1717OPENCV_HAL_IMPL_RVV_HADD(vint64m2_t, v_int64, vint64m2_t, 32, 64, i64, vadd)
1718
1719inline v_uint8 v_popcount(const v_uint8& a)
1720{
1721 return vloxei8(popCountTable, a, VTraits<v_uint8>::vlanes());
1722}
1723inline v_uint16 v_popcount(const v_uint16& a)
1724{
1725 return v_hadd(v_popcount(vreinterpret_u8m1(a)));
1726}
1727inline v_uint32 v_popcount(const v_uint32& a)
1728{
1729 return v_hadd(v_hadd(v_popcount(vreinterpret_u8m1(a))));
1730}
1731inline v_uint64 v_popcount(const v_uint64& a)
1732{
1733 return v_hadd(v_hadd(v_hadd(v_popcount(vreinterpret_u8m1(a)))));
1734}
1735
1736inline v_uint8 v_popcount(const v_int8& a)
1737{
1738 return v_popcount(v_abs(a));
1739}
1740inline v_uint16 v_popcount(const v_int16& a)
1741{
1742 return v_popcount(v_abs(a));
1743}
1744inline v_uint32 v_popcount(const v_int32& a)
1745{
1746 return v_popcount(v_abs(a));
1747}
1748inline v_uint64 v_popcount(const v_int64& a)
1749{
1750 // max(a, 0 - a) is used, since v_abs does not support 64-bit integers.
1751 return v_popcount(v_reinterpret_as_u64(vmax(a, v_sub(v_setzero_s64(), a), VTraits<v_int64>::vlanes())));
1752}
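// Example (sketch): 8-bit popcount is a table gather (vloxei8 uses each byte
// of `a` as an offset into popCountTable); wider lanes then sum adjacent
// byte counts with v_hadd, so for u32 lanes {0x0F, 0xFF00}:
//
//     v_popcount(v) -> {4u, 8u}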
1753
1754
1756#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec) \
1757inline int v_signmask(const _Tpvec& a) \
1758{ \
1759 uint8_t ans[4] = {0}; \
1760 vsm(ans, vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
1761 return *(reinterpret_cast<int*>(ans)) & (((__int128_t)1 << VTraits<_Tpvec>::vlanes()) - 1); \
1762} \
1763inline int v_scan_forward(const _Tpvec& a) \
1764{ \
1765 return (int)vfirst(vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
1766}
1767
1768OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8)
1769OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16)
1770OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32)
1771OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64)
1772
1773inline int64 v_signmask(const v_uint8& a)
1774{ return v_signmask(v_reinterpret_as_s8(a)); }
1775inline int64 v_signmask(const v_uint16& a)
1776{ return v_signmask(v_reinterpret_as_s16(a)); }
1777inline int v_signmask(const v_uint32& a)
1778{ return v_signmask(v_reinterpret_as_s32(a)); }
1779inline int v_signmask(const v_float32& a)
1780{ return v_signmask(v_reinterpret_as_s32(a)); }
1781inline int v_signmask(const v_uint64& a)
1782{ return v_signmask(v_reinterpret_as_s64(a)); }
1783#if CV_SIMD_SCALABLE_64F
1784inline int v_signmask(const v_float64& a)
1785{ return v_signmask(v_reinterpret_as_s64(a)); }
1786#endif
1787
1789inline int v_scan_forward(const v_uint8& a)
1790{ return v_scan_forward(v_reinterpret_as_s8(a)); }
1791inline int v_scan_forward(const v_uint16& a)
1792{ return v_scan_forward(v_reinterpret_as_s16(a)); }
1793inline int v_scan_forward(const v_uint32& a)
1794{ return v_scan_forward(v_reinterpret_as_s32(a)); }
1795inline int v_scan_forward(const v_float32& a)
1796{ return v_scan_forward(v_reinterpret_as_s32(a)); }
1797inline int v_scan_forward(const v_uint64& a)
1798{ return v_scan_forward(v_reinterpret_as_s64(a)); }
1799#if CV_SIMD_SCALABLE_64F
1800inline int v_scan_forward(const v_float64& a)
1801{ return v_scan_forward(v_reinterpret_as_s64(a)); }
1802#endif
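// Example (sketch): v_signmask packs per-lane sign bits into an integer
// (bit i set iff lane i is negative); v_scan_forward returns the index of
// the first negative lane, or -1 if none, e.g. for v = {-1, 2, -3, 4}:
//
//     v_signmask(v)     == 0b0101
//     v_scan_forward(v) == 0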
1803
1805// {A0, A1, A2, A3, B0, B1, B2, B3, C0 ...} --> {A0, A1, A2, B0, B1, B2, C0 ...}
1806// mask: {0,0,0,1, ...} -> {T,T,T,F, ...}
1807#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, v_trunc) \
1808inline _Tpvec v_pack_triplets(const _Tpvec& vec) { \
1809 size_t vl = __cv_rvv_e8m1_nlanes; \
1810 vuint32m1_t one = vmv_v_x_u32m1(1, __cv_rvv_e32m1_nlanes); \
1811 vuint8m1_t zero = vmv_v_x_u8m1(0, vl); \
1812 vuint8m1_t mask = vreinterpret_u8m1(one); \
1813 return vcompress(vmseq(v_trunc(vslideup(zero, mask, 3, vl)), 0, vl), vec, vec, VTraits<_Tpvec>::vlanes()); \
1814}
1815
1816OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8, OPENCV_HAL_NOP)
1817OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8, OPENCV_HAL_NOP)
1818OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16, vlmul_trunc_u8mf2)
1819OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16, vlmul_trunc_u8mf2)
1820OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32, vlmul_trunc_u8mf4)
1821OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32, vlmul_trunc_u8mf4)
1822OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32, vlmul_trunc_u8mf4)
1823OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint64, vlmul_trunc_u8mf8)
1824OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int64, vlmul_trunc_u8mf8)
1825#if CV_SIMD_SCALABLE_64F
1826OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float64, vlmul_trunc_u8mf8)
1827#endif
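// The mask construction above (sketch): `one` reinterpreted as bytes is
// {1,0,0,0, 1,0,0,0, ...}; sliding it up by 3 gives {0,0,0,1, ...}, and
// vcompress keeps the lanes where that pattern is 0, dropping every fourth
// lane: {A0 A1 A2 A3 B0 B1 B2 B3 ...} -> {A0 A1 A2 B0 B1 B2 ...}.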
1828
1829
1831
1832#if defined(__riscv_zfh) && __riscv_zfh
1833inline v_float32 v_load_expand(const hfloat* ptr)
1834{
1835 return vfwcvt_f(vle16_v_f16mf2((_Float16*)ptr, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1836}
1837
1838inline void v_pack_store(hfloat* ptr, const v_float32& v)
1839{
1840 vse16_v_f16mf2((_Float16*)ptr, vfncvt_f_f_w_f16mf2(v, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1841}
1842#else
1843inline v_float32 v_load_expand(const hfloat* ptr)
1844{
1845 float buf[32];
1846 for( int i = 0; i < VTraits<v_float32>::vlanes(); i++ ) buf[i] = (float)ptr[i];
1847 return v_load(buf);
1848}
1849
1850inline void v_pack_store(hfloat* ptr, const v_float32& v)
1851{
1852 float buf[32];
1853 v_store(buf, v);
1854 for( int i = 0; i < VTraits<v_float32>::vlanes(); i++ ) ptr[i] = hfloat(buf[i]);
1855}
1856#endif
1858inline v_int32 v_round(const v_float32& a)
1859{
1860 // return vfcvt_x(vfadd(a, 1e-6, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1861 return vfcvt_x(a, VTraits<v_float32>::vlanes());
1862}
1863
1864inline v_int32 v_floor(const v_float32& a)
1865{
1866 return vfcvt_x(vfsub(a, 0.5f - 1e-5, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1867 // return vfcvt_x(a, VTraits<v_float32>::vlanes());
1868}
1869
1870inline v_int32 v_ceil(const v_float32& a)
1871{
1872 return vfcvt_x(vfadd(a, 0.5f - 1e-5, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
1873}
1874
1875inline v_int32 v_trunc(const v_float32& a)
1876{
1877 return vfcvt_rtz_x(a, VTraits<v_float32>::vlanes());
1878}
1879#if CV_SIMD_SCALABLE_64F
1880inline v_int32 v_round(const v_float64& a)
1881{
1882 return vfncvt_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
1883}
1884
1885inline v_int32 v_round(const v_float64& a, const v_float64& b)
1886{
1887 // return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
1888 // Fix https://github.com/opencv/opencv/issues/24746
1889 return vfncvt_x(vset(vlmul_ext_f64m2(a), 1, b), VTraits<v_float32>::vlanes());
1890}
1891
1892inline v_int32 v_floor(const v_float64& a)
1893{
1894 return vfncvt_x(vlmul_ext_f64m2(vfsub(a, 0.5f - 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
1895}
1896
1897inline v_int32 v_ceil(const v_float64& a)
1898{
1899 return vfncvt_x(vlmul_ext_f64m2(vfadd(a, 0.5f - 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
1900}
1901
1902inline v_int32 v_trunc(const v_float64& a)
1903{
1904 return vfncvt_rtz_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
1905}
1906#endif
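// Rounding model (sketch): vfcvt_x converts with the current FP rounding
// mode (round-to-nearest by default), which gives v_round directly; v_floor
// and v_ceil emulate their modes by biasing with -/+(0.5 - eps) before the
// nearest conversion, e.g. v_floor: 2.9 - 0.49999 -> 2.40001 -> 2, and
// -2.1 - 0.49999 -> -2.59999 -> -3. v_trunc uses the dedicated
// round-towards-zero conversion (vfcvt_rtz_x) instead.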
1907
1909
1910// 16 >> 32
1911inline v_int32 v_dotprod(const v_int16& a, const v_int16& b)
1912{
1913 vint32m2_t temp1 = vwmul(a, b, VTraits<v_int16>::vlanes());
1914 return v_hadd(temp1);
1915}
1916
1917inline v_int32 v_dotprod(const v_int16& a, const v_int16& b, const v_int32& c)
1918{
1919 vint32m2_t temp1 = vwmul(a, b, VTraits<v_int16>::vlanes());
1920 return vadd(v_hadd(temp1), c, VTraits<v_int32>::vlanes());
1921}
1922
1923// 32 >> 64
1924inline v_int64 v_dotprod(const v_int32& a, const v_int32& b)
1925{
1926 vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
1927 vuint32m1_t one32 = vreinterpret_u32m1(one64);
1928 vbool32_t mask = vmseq(one32, 1, VTraits<v_uint32>::vlanes());
1929 vint64m2_t temp1 = vwmul(a, b, VTraits<v_int32>::vlanes());
1930 vint64m2_t temp2 = vslide1down(temp1, 0, VTraits<v_int32>::vlanes());
1931 vint64m2_t res = vadd(temp1, temp2, VTraits<v_int32>::vlanes());
1932 res = vcompress(mask, res, res, VTraits<v_int32>::vlanes());
1933 return vlmul_trunc_i64m1(res);
1934}
1935inline v_int64 v_dotprod(const v_int32& a, const v_int32& b, const v_int64& c)
1936{
1937 vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
1938 vuint32m1_t one32 = vreinterpret_u32m1(one64);
1939 vbool32_t mask = vmseq(one32, 1, VTraits<v_uint32>::vlanes());
1940 vint64m2_t temp1 = vwmul(a, b, VTraits<v_int32>::vlanes());
1941 vint64m2_t temp2 = vslide1down(temp1, 0, VTraits<v_int32>::vlanes());
1942 vint64m2_t res = vadd(temp1, temp2, VTraits<v_int32>::vlanes());
1943 res = vcompress(mask, res, res, VTraits<v_int32>::vlanes());
1944 return vadd(vlmul_trunc_i64m1(res), c, VTraits<v_int64>::vlanes());
1945}
1946
1947// 8 >> 32
1948inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b)
1949{
1950 vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
1951 vuint8m1_t one8 = vreinterpret_u8m1(one32);
1952 vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
1953 vuint16m2_t t0 = vwmulu(a, b, VTraits<v_uint8>::vlanes());
1954 vuint16m2_t t1 = vslide1down(t0, 0, VTraits<v_uint8>::vlanes());
1955 vuint16m2_t t2 = vslide1down(t1, 0, VTraits<v_uint8>::vlanes());
1956 vuint16m2_t t3 = vslide1down(t2, 0, VTraits<v_uint8>::vlanes());
1957 vuint32m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint8>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint8>::vlanes()), VTraits<v_uint8>::vlanes());
1958 res = vcompress(mask, res, res, VTraits<v_uint8>::vlanes());
1959 return vlmul_trunc_u32m1(res);
1960}
1961
1962inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b,
1963 const v_uint32& c)
1964{
1965 vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
1966 vuint8m1_t one8 = vreinterpret_u8m1(one32);
1967 vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
1968 vuint16m2_t t0 = vwmulu(a, b, VTraits<v_uint8>::vlanes());
1969 vuint16m2_t t1 = vslide1down(t0, 0, VTraits<v_uint8>::vlanes());
1970 vuint16m2_t t2 = vslide1down(t1, 0, VTraits<v_uint8>::vlanes());
1971 vuint16m2_t t3 = vslide1down(t2, 0, VTraits<v_uint8>::vlanes());
1972 vuint32m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint8>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint8>::vlanes()), VTraits<v_uint8>::vlanes());
1973 res = vcompress(mask, res, res, VTraits<v_uint8>::vlanes());
1974 return vadd(vlmul_trunc_u32m1(res), c, VTraits<v_uint8>::vlanes());
1975}
1976
1977inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b)
1978{
1979 vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
1980 vuint8m1_t one8 = vreinterpret_u8m1(one32);
1981 vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
1982 vint16m2_t t0 = vwmul(a, b, VTraits<v_int8>::vlanes());
1983 vint16m2_t t1 = vslide1down(t0, 0, VTraits<v_int8>::vlanes());
1984 vint16m2_t t2 = vslide1down(t1, 0, VTraits<v_int8>::vlanes());
1985 vint16m2_t t3 = vslide1down(t2, 0, VTraits<v_int8>::vlanes());
1986 vint32m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int8>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int8>::vlanes()), VTraits<v_int8>::vlanes());
1987 res = vcompress(mask, res, res, VTraits<v_int8>::vlanes());
1988 return vlmul_trunc_i32m1(res);
1989}
1990
1991inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b,
1992 const v_int32& c)
1993{
1994 vuint32m1_t one32 = vmv_v_x_u32m1(1, VTraits<v_uint32>::vlanes());
1995 vuint8m1_t one8 = vreinterpret_u8m1(one32);
1996 vbool8_t mask = vmseq(one8, 1, VTraits<v_uint8>::vlanes());
1997 vint16m2_t t0 = vwmul(a, b, VTraits<v_int8>::vlanes());
1998 vint16m2_t t1 = vslide1down(t0, 0, VTraits<v_int8>::vlanes());
1999 vint16m2_t t2 = vslide1down(t1, 0, VTraits<v_int8>::vlanes());
2000 vint16m2_t t3 = vslide1down(t2, 0, VTraits<v_int8>::vlanes());
2001 vint32m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int8>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int8>::vlanes()), VTraits<v_int8>::vlanes());
2002 res = vcompress(mask, res, res, VTraits<v_int8>::vlanes());
2003 return vadd(vlmul_trunc_i32m1(res), c, VTraits<v_int8>::vlanes());
2004}
2005
2006
2007// 16 >> 64
2008inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b)
2009{
2010 vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
2011 vuint16m1_t one16 = vreinterpret_u16m1(one64);
2012 vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
2013 vuint32m2_t t0 = vwmulu(a, b, VTraits<v_uint16>::vlanes());
2014 vuint32m2_t t1 = vslide1down(t0, 0, VTraits<v_uint16>::vlanes());
2015 vuint32m2_t t2 = vslide1down(t1, 0, VTraits<v_uint16>::vlanes());
2016 vuint32m2_t t3 = vslide1down(t2, 0, VTraits<v_uint16>::vlanes());
2017 vuint64m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint16>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint16>::vlanes()), VTraits<v_uint16>::vlanes());
2018 res = vcompress(mask, res, res, VTraits<v_uint16>::vlanes());
2019 return vlmul_trunc_u64m1(res);
2020}
2021inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b, const v_uint64& c)
2022{
2023 vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
2024 vuint16m1_t one16 = vreinterpret_u16m1(one64);
2025 vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
2026 vuint32m2_t t0 = vwmulu(a, b, VTraits<v_uint16>::vlanes());
2027 vuint32m2_t t1 = vslide1down(t0, 0, VTraits<v_uint16>::vlanes());
2028 vuint32m2_t t2 = vslide1down(t1, 0, VTraits<v_uint16>::vlanes());
2029 vuint32m2_t t3 = vslide1down(t2, 0, VTraits<v_uint16>::vlanes());
2030 vuint64m4_t res = vadd(vwaddu_vv(t2, t3, VTraits<v_uint16>::vlanes()), vwaddu_vv(t0, t1, VTraits<v_uint16>::vlanes()), VTraits<v_uint16>::vlanes());
2031 res = vcompress(mask, res, res, VTraits<v_uint16>::vlanes());
2032 return vadd(vlmul_trunc_u64m1(res), c, VTraits<v_uint16>::vlanes());
2033}
2034
2035inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b)
2036{
2037 vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
2038 vuint16m1_t one16 = vreinterpret_u16m1(one64);
2039 vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
2040 vint32m2_t t0 = vwmul(a, b, VTraits<v_int16>::vlanes());
2041 vint32m2_t t1 = vslide1down(t0, 0, VTraits<v_int16>::vlanes());
2042 vint32m2_t t2 = vslide1down(t1, 0, VTraits<v_int16>::vlanes());
2043 vint32m2_t t3 = vslide1down(t2, 0, VTraits<v_int16>::vlanes());
2044 vint64m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int16>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int16>::vlanes()), VTraits<v_int16>::vlanes());
2045 res = vcompress(mask, res, res, VTraits<v_int16>::vlanes());
2046 return vlmul_trunc_i64m1(res);
2047}
2048inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b,
2049 const v_int64& c)
2050{
2051 vuint64m1_t one64 = vmv_v_x_u64m1(1, VTraits<v_uint64>::vlanes());
2052 vuint16m1_t one16 = vreinterpret_u16m1(one64);
2053 vbool16_t mask = vmseq(one16, 1, VTraits<v_uint16>::vlanes());
2054 vint32m2_t t0 = vwmul(a, b, VTraits<v_int16>::vlanes());
2055 vint32m2_t t1 = vslide1down(t0, 0, VTraits<v_int16>::vlanes());
2056 vint32m2_t t2 = vslide1down(t1, 0, VTraits<v_int16>::vlanes());
2057 vint32m2_t t3 = vslide1down(t2, 0, VTraits<v_int16>::vlanes());
2058 vint64m4_t res = vadd(vwadd_vv(t2, t3, VTraits<v_int16>::vlanes()), vwadd_vv(t0, t1, VTraits<v_int16>::vlanes()), VTraits<v_int16>::vlanes());
2059 res = vcompress(mask, res, res, VTraits<v_int16>::vlanes());
2060 return vadd(vlmul_trunc_i64m1(res), c, VTraits<v_int16>::vlanes());
2061}
2062
2063// 32 >> 64f
2064#if CV_SIMD_SCALABLE_64F
2065inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b)
2066{ return v_cvt_f64(v_dotprod(a, b)); }
2067inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b,
2068 const v_float64& c)
2069{ return v_add(v_dotprod_expand(a, b) , c); }
2070#endif
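// Lane model of the dot products above (sketch): vwmul forms the 2x-wide
// products {a0*b0, a1*b1, ...}; adding a one-lane slide-down puts pairwise
// sums in the even lanes; vcompress with the {1,0,1,0,...} mask keeps just
// those, so for int32 inputs:
//
//     v_dotprod(a, b) -> {a0*b0 + a1*b1, a2*b2 + a3*b3, ...}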
2071
2073// 16 >> 32
2074inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b)
2075{
2076 v_int32 zero = v_setzero_s32();
2077 return vredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero, VTraits<v_int16>::vlanes());
2078}
2079inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b, const v_int32& c)
2080{
2081 v_int32 zero = v_setzero_s32();
2082 return vredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), vredsum(zero, c, zero, VTraits<v_int32>::vlanes()), VTraits<v_int16>::vlanes());
2083}
2084
2085// 32 >> 64
2086inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b)
2087{
2088 v_int64 zero = v_setzero_s64();
2089 return vredsum(zero, vwmul(a, b, VTraits<v_int32>::vlanes()), zero, VTraits<v_int32>::vlanes());
2090}
2091inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b, const v_int64& c)
2092{
2093 v_int64 zero = v_setzero_s64();
2094 return vadd(vredsum(zero, vwmul(a, b, VTraits<v_int32>::vlanes()), zero, VTraits<v_int32>::vlanes()) , vredsum(zero, c, zero, VTraits<v_int64>::vlanes()), VTraits<v_int64>::vlanes());
2095}
2096
2097
2098// 8 >> 32
2099inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b)
2100{
2101 v_uint32 zero = v_setzero_u32();
2102 return vwredsumu(zero, vwmulu(a, b, VTraits<v_uint8>::vlanes()), zero, VTraits<v_uint8>::vlanes());
2103}
2104inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b, const v_uint32& c)
2105{
2106 v_uint32 zero = v_setzero_u32();
2107 return vadd(vwredsumu(zero, vwmulu(a, b, VTraits<v_uint8>::vlanes()), zero, VTraits<v_uint8>::vlanes()) , vredsum(zero, c, zero, VTraits<v_uint32>::vlanes()), VTraits<v_uint32>::vlanes());
2108}
2109inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b)
2110{
2111 v_int32 zero = v_setzero_s32();
2112 return vwredsum(zero, vwmul(a, b, VTraits<v_int8>::vlanes()), zero, VTraits<v_int8>::vlanes());
2113}
2114inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b, const v_int32& c)
2115{
2116 v_int32 zero = v_setzero_s32();
2117 return vadd(vwredsum(zero, vwmul(a, b, VTraits<v_int8>::vlanes()), zero, VTraits<v_int8>::vlanes()) , vredsum(zero, c, zero, VTraits<v_int32>::vlanes()), VTraits<v_int32>::vlanes());
2118}
2119
2120// 16 >> 64
2121inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b)
2122{
2123 v_uint64 zero = v_setzero_u64();
2124 return vwredsumu(zero, vwmulu(a, b, VTraits<v_uint16>::vlanes()), zero, VTraits<v_uint16>::vlanes());
2125}
2126inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b, const v_uint64& c)
2127{
2128 v_uint64 zero = v_setzero_u64();
2129 return vadd(vwredsumu(zero, vwmulu(a, b, VTraits<v_uint16>::vlanes()), zero, VTraits<v_uint16>::vlanes()), vredsum(zero, c, zero, VTraits<v_uint64>::vlanes()), VTraits<v_uint64>::vlanes());
2130}
2131inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b)
2132{
2133 v_int64 zero = v_setzero_s64();
2134 return vwredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero, VTraits<v_int16>::vlanes());
2135}
2136inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b, const v_int64& c)
2137{
2138 v_int64 zero = v_setzero_s64();
2139 return vadd(vwredsum(zero, vwmul(a, b, VTraits<v_int16>::vlanes()), zero, VTraits<v_int16>::vlanes()), vredsum(zero, c, zero, VTraits<v_int64>::vlanes()), VTraits<v_int64>::vlanes());
2140}
2141
2142// 32 >> 64f
2143#if CV_SIMD_SCALABLE_64F
2144inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b)
2145{ return v_cvt_f64(v_dotprod_fast(a, b)); }
2146inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b, const v_float64& c)
2147{ return v_add(v_dotprod_expand_fast(a, b) , c); }
2148#endif
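// The _fast variants trade the pairwise lane layout above for a full
// reduction (vredsum/vwredsum): the entire dot product is accumulated into
// lane 0, which is cheaper when the caller only needs the final scalar sum.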
2149
2150// TODO: only the 128-bit case is handled for now.
2151inline v_float32 v_matmul(const v_float32& v, const v_float32& m0,
2152 const v_float32& m1, const v_float32& m2,
2153 const v_float32& m3)
2154{
2155 vfloat32m1_t res;
2156 res = vfmul_vf_f32m1(m0, v_extract_n(v, 0), VTraits<v_float32>::vlanes());
2157 res = vfmacc_vf_f32m1(res, v_extract_n(v, 1), m1, VTraits<v_float32>::vlanes());
2158 res = vfmacc_vf_f32m1(res, v_extract_n(v, 2), m2, VTraits<v_float32>::vlanes());
2159 res = vfmacc_vf_f32m1(res, v_extract_n(v, 3), m3, VTraits<v_float32>::vlanes());
2160 return res;
2161}
2162
2163// TODO: only the 128-bit case is handled for now.
2164inline v_float32 v_matmuladd(const v_float32& v, const v_float32& m0,
2165 const v_float32& m1, const v_float32& m2,
2166 const v_float32& a)
2167{
2168 vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n(v,0), VTraits<v_float32>::vlanes());
2169 res = vfmacc_vf_f32m1(res, v_extract_n(v,1), m1, VTraits<v_float32>::vlanes());
2170 res = vfmacc_vf_f32m1(res, v_extract_n(v,2), m2, VTraits<v_float32>::vlanes());
2171 return vfadd(res, a, VTraits<v_float32>::vlanes());
2172}
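// Usage sketch (128-bit case): v_matmul forms a linear combination of the
// four input vectors weighted by the lanes of v; with m0..m3 as matrix
// columns this is the product M * v:
//
//     v_float32 y = v_matmul(v, m0, m1, m2, m3);
//     // y[j] = v[0]*m0[j] + v[1]*m1[j] + v[2]*m2[j] + v[3]*m3[j]
//
// v_matmuladd is the 3x3-plus-translation variant: m0..m2 scaled by the
// first three lanes of v, plus `a`.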
2173
2174inline void v_cleanup() {}
2175
2176CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2177
2179
2180} //namespace cv
2181
2182#endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP