#ifndef OPENCV_HAL_INTRIN_RVV_HPP
#define OPENCV_HAL_INTRIN_RVV_HPP

#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>10999
#include "intrin_rvv_010_compat_non-policy.hpp"
#include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
#endif

#ifdef __THEAD_VERSION__
#   if __riscv_v == 7000
#       include <fenv.h>
#       define CV_RVV_THEAD_0_7
#   endif
#endif

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128 1
#ifndef CV_RVV_THEAD_0_7
#   define CV_SIMD128_64F 1
#else
#   define CV_SIMD128_64F 0
#endif
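// Emulated fractional-LMUL register types (mf2/mf4) and a minimal set of
// intrinsics over them, for toolchains that do not provide these natively.
// Each emulated register simply keeps its lanes in a plain array.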
struct vuint8mf2_t
{
    uchar val[8] = {0};
    vuint8mf2_t() {}
    vuint8mf2_t(const uchar* ptr) { for (int i = 0; i < 8; ++i) val[i] = ptr[i]; }
};
struct vint8mf2_t
{
    schar val[8] = {0};
    vint8mf2_t() {}
    vint8mf2_t(const schar* ptr) { for (int i = 0; i < 8; ++i) val[i] = ptr[i]; }
};
struct vuint16mf2_t
{
    ushort val[4] = {0};
    vuint16mf2_t() {}
    vuint16mf2_t(const ushort* ptr) { for (int i = 0; i < 4; ++i) val[i] = ptr[i]; }
};
struct vint16mf2_t
{
    short val[4] = {0};
    vint16mf2_t() {}
    vint16mf2_t(const short* ptr) { for (int i = 0; i < 4; ++i) val[i] = ptr[i]; }
};
struct vuint32mf2_t
{
    unsigned val[2] = {0};
    vuint32mf2_t() {}
    vuint32mf2_t(const unsigned* ptr) { val[0] = ptr[0]; val[1] = ptr[1]; }
};
struct vint32mf2_t
{
    int val[2] = {0};
    vint32mf2_t() {}
    vint32mf2_t(const int* ptr) { val[0] = ptr[0]; val[1] = ptr[1]; }
};
struct vfloat32mf2_t
{
    float val[2] = {0};
    vfloat32mf2_t() {}
    vfloat32mf2_t(const float* ptr) { val[0] = ptr[0]; val[1] = ptr[1]; }
};
struct vuint64mf2_t
{
    uint64 val[1] = {0};
    vuint64mf2_t() {}
    vuint64mf2_t(const uint64* ptr) { val[0] = ptr[0]; }
};
struct vint64mf2_t
{
    int64 val[1] = {0};
    vint64mf2_t() {}
    vint64mf2_t(const int64* ptr) { val[0] = ptr[0]; }
};
struct vfloat64mf2_t
{
    double val[1] = {0};
    vfloat64mf2_t() {}
    vfloat64mf2_t(const double* ptr) { val[0] = ptr[0]; }
};
struct vuint8mf4_t
{
    uchar val[4] = {0};
    vuint8mf4_t() {}
    vuint8mf4_t(const uchar* ptr) { for (int i = 0; i < 4; ++i) val[i] = ptr[i]; }
};
struct vint8mf4_t
{
    schar val[4] = {0};
    vint8mf4_t() {}
    vint8mf4_t(const schar* ptr) { for (int i = 0; i < 4; ++i) val[i] = ptr[i]; }
};
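// Scalar emulation of the unit-stride load/store intrinsics for the emulated
// mf2 types above: vle<w>_v_<t>mf2 copies n lanes in from memory and
// vse<w>_v_<t>mf2 copies them back. The vl argument exists only for API parity.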
#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr, size_t vl) \
{ \
    CV_UNUSED(vl); \
    return _Tpvec(ptr); \
} \
inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v, size_t vl) \
{ \
    CV_UNUSED(vl); \
    for (int i = 0; i < n; ++i) \
    { \
        ptr[i] = v.val[i]; \
    } \
}

OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)
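// Scalar emulation of the widening conversions (e.g. u8mf2 -> u16m1): each
// lane is widened into a temporary array which is then loaded as an m1 register.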
#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
inline _Tpwvec wcvt (_Tpvec v, size_t vl) \
{ \
    _wTp tmp[n]; \
    for (int i = 0; i < n; ++i) \
    { \
        tmp[i] = (_wTp)v.val[i]; \
    } \
    return vle##width##_v_##suffix##m1(tmp, vl); \
}

OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)
inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base, size_t vl)
{
    CV_UNUSED(vl);
    return vuint8mf4_t(base);
}
inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base, size_t vl)
{
    CV_UNUSED(vl);
    return vint8mf4_t(base);
}

inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src, size_t vl)
{
    ushort tmp[4];
    for (int i = 0; i < 4; ++i)
    {
        tmp[i] = (ushort)src.val[i];
    }
    return vle16_v_u16mf2(tmp, vl);
}
inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src, size_t vl)
{
    short tmp[4];
    for (int i = 0; i < 4; ++i)
    {
        tmp[i] = (short)src.val[i];
    }
    return vle16_v_i16mf2(tmp, vl);
}
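// 128-bit universal-intrinsic register types. Each v_<T>x<N> wraps one RVV m1
// register; the lanes are mirrored in the val[] array, kept in sync through
// the vle/vse load/store intrinsics.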
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(vuint8m1_t v) { vse8_v_u8m1(val, v, nlanes); }
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vuint8m1_t() const { return vle8_v_u8m1(val, nlanes); }
    uchar get0() const { return val[0]; }

    uchar val[16];
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(vint8m1_t v) { vse8_v_i8m1(val, v, nlanes); }
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vint8m1_t() const { return vle8_v_i8m1(val, nlanes); }
    schar get0() const { return val[0]; }

    schar val[16];
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(vuint16m1_t v) { vse16_v_u16m1(val, v, nlanes); }
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vuint16m1_t() const { return vle16_v_u16m1(val, nlanes); }
    ushort get0() const { return val[0]; }

    ushort val[8];
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(vint16m1_t v) { vse16_v_i16m1(val, v, nlanes); }
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vint16m1_t() const { return vle16_v_i16m1(val, nlanes); }
    short get0() const { return val[0]; }

    short val[8];
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(vuint32m1_t v) { vse32_v_u32m1(val, v, nlanes); }
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vuint32m1_t() const { return vle32_v_u32m1(val, nlanes); }
    unsigned get0() const { return val[0]; }

    unsigned val[4];
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(vint32m1_t v) { vse32_v_i32m1(val, v, nlanes); }
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vint32m1_t() const { return vle32_v_i32m1(val, nlanes); }
    int get0() const { return val[0]; }

    int val[4];
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(vfloat32m1_t v) { vse32_v_f32m1(val, v, nlanes); }
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vfloat32m1_t() const { return vle32_v_f32m1(val, nlanes); }
    float get0() const { return val[0]; }

    float val[4];
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(vuint64m1_t v) { vse64_v_u64m1(val, v, nlanes); }
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vuint64m1_t() const { return vle64_v_u64m1(val, nlanes); }
    uint64 get0() const { return val[0]; }

    uint64 val[2];
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(vint64m1_t v) { vse64_v_i64m1(val, v, nlanes); }
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vint64m1_t() const { return vle64_v_i64m1(val, nlanes); }
    int64 get0() const { return val[0]; }

    int64 val[2];
};

#if CV_SIMD128_64F
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(vfloat64m1_t v) { vse64_v_f64m1(val, v, nlanes); }
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vfloat64m1_t() const { return vle64_v_f64m1(val, nlanes); }
    double get0() const { return val[0]; }

    double val[2];
};
#endif
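// Alternative definitions of the same register types for toolchains where the
// native register should be the primary representation: lanes still live in
// val[], but pval reinterprets that storage as the m1 register and all access
// goes through it.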
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(vuint8m1_t v) { *pval = v; }
    v_uint8x16(const v_uint8x16& vec) { *pval = *(vec.pval); }
    v_uint8x16& operator=(const v_uint8x16& vec) { *pval = *(vec.pval); return *this; }
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        *pval = vle8_v_u8m1(v, nlanes);
    }
    operator vuint8m1_t() const { return *pval; }
    uchar get0() const { return vmv_x(*pval); }
    uchar val[16];
    vuint8m1_t* pval = (vuint8m1_t*)val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(vint8m1_t v) { *pval = v; }
    v_int8x16(const v_int8x16& vec) { *pval = *(vec.pval); }
    v_int8x16& operator=(const v_int8x16& vec) { *pval = *(vec.pval); return *this; }
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        *pval = vle8_v_i8m1(v, nlanes);
    }
    operator vint8m1_t() const { return *pval; }
    schar get0() const { return vmv_x(*pval); }
    schar val[16];
    vint8m1_t* pval = (vint8m1_t*)val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(vuint16m1_t v) { *pval = v; }
    v_uint16x8(const v_uint16x8& vec) { *pval = *(vec.pval); }
    v_uint16x8& operator=(const v_uint16x8& vec) { *pval = *(vec.pval); return *this; }
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        *pval = vle16_v_u16m1(v, nlanes);
    }
    operator vuint16m1_t() const { return *pval; }
    ushort get0() const { return vmv_x(*pval); }
    ushort val[8];
    vuint16m1_t* pval = (vuint16m1_t*)val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(vint16m1_t v) { *pval = v; }
    v_int16x8(const v_int16x8& vec) { *pval = *(vec.pval); }
    v_int16x8& operator=(const v_int16x8& vec) { *pval = *(vec.pval); return *this; }
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        *pval = vle16_v_i16m1(v, nlanes);
    }
    operator vint16m1_t() const { return *pval; }
    short get0() const { return vmv_x(*pval); }
    short val[8];
    vint16m1_t* pval = (vint16m1_t*)val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(vuint32m1_t v) { *pval = v; }
    v_uint32x4(const v_uint32x4& vec) { *pval = *(vec.pval); }
    v_uint32x4& operator=(const v_uint32x4& vec) { *pval = *(vec.pval); return *this; }
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        *pval = vle32_v_u32m1(v, nlanes);
    }
    operator vuint32m1_t() const { return *pval; }
    unsigned get0() const { return vmv_x(*pval); }
    unsigned val[4];
    vuint32m1_t* pval = (vuint32m1_t*)val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(vint32m1_t v) { *pval = v; }
    v_int32x4(const v_int32x4& vec) { *pval = *(vec.pval); }
    v_int32x4& operator=(const v_int32x4& vec) { *pval = *(vec.pval); return *this; }
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        *pval = vle32_v_i32m1(v, nlanes);
    }
    operator vint32m1_t() const { return *pval; }
    int get0() const { return vmv_x(*pval); }
    int val[4];
    vint32m1_t* pval = (vint32m1_t*)val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(vfloat32m1_t v) { *pval = v; }
    v_float32x4(const v_float32x4& vec) { *pval = *(vec.pval); }
    v_float32x4& operator=(const v_float32x4& vec) { *pval = *(vec.pval); return *this; }
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        *pval = vle32_v_f32m1(v, nlanes);
    }
    operator vfloat32m1_t() const { return *pval; }
    float get0() const { return vfmv_f(*pval); }
    float val[4];
    vfloat32m1_t* pval = (vfloat32m1_t*)val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(vuint64m1_t v) { *pval = v; }
    v_uint64x2(const v_uint64x2& vec) { *pval = *(vec.pval); }
    v_uint64x2& operator=(const v_uint64x2& vec) { *pval = *(vec.pval); return *this; }
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        *pval = vle64_v_u64m1(v, nlanes);
    }
    operator vuint64m1_t() const { return *pval; }
    uint64 get0() const { return vmv_x(*pval); }
    uint64 val[2];
    vuint64m1_t* pval = (vuint64m1_t*)val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(vint64m1_t v) { *pval = v; }
    v_int64x2(const v_int64x2& vec) { *pval = *(vec.pval); }
    v_int64x2& operator=(const v_int64x2& vec) { *pval = *(vec.pval); return *this; }
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        *pval = vle64_v_i64m1(v, nlanes);
    }
    operator vint64m1_t() const { return *pval; }
    int64 get0() const { return vmv_x(*pval); }
    int64 val[2];
    vint64m1_t* pval = (vint64m1_t*)val;
};

#if CV_SIMD128_64F
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(vfloat64m1_t v) { *pval = v; }
    v_float64x2(const v_float64x2& vec) { *pval = *(vec.pval); }
    v_float64x2& operator=(const v_float64x2& vec) { *pval = *(vec.pval); return *this; }
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        *pval = vle64_v_f64m1(v, nlanes);
    }
    operator vfloat64m1_t() const { return *pval; }
    double get0() const { return vfmv_f(*pval); }
    double val[2];
    vfloat64m1_t* pval = (vfloat64m1_t*)val;
};
#endif
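// Initialization: v_setzero_* / v_setall_* broadcast a scalar into every lane
// with vmv.v.x (integers) or vfmv.v.f (floats).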
#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
inline v_##_Tpvec v_setzero_##suffix1() \
{ \
    return v_##_Tpvec(vmv_v_x_##suffix2##m1(0, vl)); \
} \
inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
{ \
    return v_##_Tpvec(vmv_v_x_##suffix2##m1(v, vl)); \
}

OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, u8, u8, 16)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, s8, i8, 16)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, u16, u16, 8)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, s16, i16, 8)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, u32, u32, 4)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, s32, i32, 4)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, u64, u64, 2)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, s64, i64, 2)

#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
inline v_##_Tpv v_setzero_##suffix() \
{ \
    return v_##_Tpv(vfmv_v_f_##suffix##m1(0, vl)); \
} \
inline v_##_Tpv v_setall_##suffix(_Tp v) \
{ \
    return v_##_Tpv(vfmv_v_f_##suffix##m1(v, vl)); \
}

OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, f32, 4)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, f64, 2)
#endif
#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }

OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
#endif

#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v)); \
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v)); \
}

OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, int8x16, u8, s8, u8, i8)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, int16x8, u16, s16, u16, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, int32x4, u32, s32, u32, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, float32x4, u32, f32, u32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, float32x4, s32, f32, i32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, int64x2, u64, s64, u64, i64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, float64x2, u64, f64, u64, f64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64x2, float64x2, s64, f64, i64, f64)
#endif
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint16x8, u8, u16, u8, u16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint32x4, u8, u32, u8, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint64x2, u8, u64, u8, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint32x4, u16, u32, u16, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint64x2, u16, u64, u16, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, uint64x2, u32, u64, u32, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int16x8, s8, s16, i8, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int32x4, s8, s32, i8, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int64x2, s8, s64, i8, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int32x4, s16, s32, i16, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int64x2, s16, s64, i16, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, int64x2, s32, s64, i32, i64)
#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return v_##_Tpvec1(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v))); \
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v))); \
}

OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int16x8, u8, s16, u, i, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int32x4, u8, s32, u, i, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int64x2, u8, s64, u, i, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int8x16, u16, s8, u, i, 16, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int32x4, u16, s32, u, i, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int64x2, u16, s64, u, i, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int8x16, u32, s8, u, i, 32, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int16x8, u32, s16, u, i, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int64x2, u32, s64, u, i, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int8x16, u64, s8, u, i, 64, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int16x8, u64, s16, u, i, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int32x4, u64, s32, u, i, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float32x4, u8, f32, u, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float32x4, u16, f32, u, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, float32x4, u64, f32, u, f, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float32x4, s8, f32, i, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float32x4, s16, f32, i, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64x2, float32x4, s64, f32, i, f, 64, 32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float64x2, u8, f64, u, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float64x2, u16, f64, u, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, float64x2, u32, f64, u, f, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float64x2, s8, f64, i, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float64x2, s16, f64, i, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32x4, float64x2, s32, f64, i, f, 32, 64)

inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v)
{
    return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v))));
}
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v)
{
    return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v))));
}
#endif
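// Extract. v_extract<s>(a, b) concatenates a and b and returns lanes
// s .. s+nlanes-1; e.g. on u32x4 lanes {a0,a1,a2,a3},{b0,b1,b2,b3} with s=2 the
// result is {a2,a3,b0,b1}. It is built from vslidedown on a plus vslideup of b.
// v_extract_n<i> slides lane i down to position 0 and moves it to a scalar.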
#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vmv, vl) \
template <int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
} \
template<int i> inline _Tp v_extract_n(_Tpvec v) \
{ \
    return _Tp(vmv(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), v, i, vl))); \
}

OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8x16, uchar, u8, vmv_x_s_u8m1_u8, 16)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8x16, schar, i8, vmv_x_s_i8m1_i8, 16)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16x8, ushort, u16, vmv_x_s_u16m1_u16, 8)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16x8, short, i16, vmv_x_s_i16m1_i16, 8)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32x4, uint, u32, vmv_x_s_u32m1_u32, 4)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32x4, int, i32, vmv_x_s_i32m1_i32, 4)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64x2, uint64, u64, vmv_x_s_u64m1_u64, 2)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64x2, int64, i64, vmv_x_s_i64m1_i64, 2)

#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vmv, vl) \
template <int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
} \
template<int i> inline _Tp v_extract_n(_Tpvec v) \
{ \
    return _Tp(vmv(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), v, i, vl))); \
}

OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32x4, float, f32, vfmv_f_s_f32m1_f32, 4)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64x2, double, f64, vfmv_f_s_f64m1_f64, 2)
#endif
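// Load/store. v_load and v_load_aligned are identical here (RVV unit-stride
// loads have no alignment requirement); v_load_low fills only the low half,
// and v_store_high slides the high half down before storing hvl lanes.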
#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
    return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
    return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
    _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
    return res; \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, hvl); \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
}

OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 16, 8, u8, vmv_v_x_u8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 16, 8, i8, vmv_v_x_i8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 8, 16, u16, vmv_v_x_u16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 8, 16, i16, vmv_v_x_i16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 4, 32, u32, vmv_v_x_u32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 4, 32, i32, vmv_v_x_i32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 2, 64, u64, vmv_v_x_u64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 2, 64, i64, vmv_v_x_i64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 4, 32, f32, vfmv_v_f_f32m1)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f64, vfmv_v_f_f64m1)
#endif
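// Usage sketch (hypothetical values, assuming a VLEN >= 128 build):
//   float src[4] = {1.f, 2.f, 3.f, 4.f}, dst[4];
//   v_float32x4 v = v_load(src);   // vle32.v
//   v_store(dst, v + v);           // vfadd.vv + vse32.v -> dst = {2, 4, 6, 8}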
inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
{
    schar elems[16] =
    {
        ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
        ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
    };
    return v_int8x16(vle8_v_i8m1(elems, 16));
}
inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
{
    short elems[8] =
    {
        ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
    };
    return v_int16x8(vle16_v_i16m1(elems, 8));
}
inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
{
    int elems[4] =
    {
        ptr0[0], ptr0[1], ptr1[0], ptr1[1]
    };
    return v_int32x4(vle32_v_i32m1(elems, 4));
}
inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
{
    float elems[4] =
    {
        ptr0[0], ptr0[1], ptr1[0], ptr1[1]
    };
    return v_float32x4(vle32_v_f32m1(elems, 4));
}
inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
{
    int64 elems[2] = { ptr0[0], ptr1[0] };
    return v_int64x2(vle64_v_i64m1(elems, 2));
}

// Lookup-table loads: gather scattered table elements through an index array.
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    schar elems[16] = { tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
                        tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
                        tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]],
                        tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]] };
    return v_int8x16(vle8_v_i8m1(elems, 16));
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    schar elems[16] = { tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1],
                        tab[idx[2]], tab[idx[2] + 1], tab[idx[3]], tab[idx[3] + 1],
                        tab[idx[4]], tab[idx[4] + 1], tab[idx[5]], tab[idx[5] + 1],
                        tab[idx[6]], tab[idx[6] + 1], tab[idx[7]], tab[idx[7] + 1] };
    return v_int8x16(vle8_v_i8m1(elems, 16));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    schar elems[16] = { tab[idx[0]], tab[idx[0] + 1], tab[idx[0] + 2], tab[idx[0] + 3],
                        tab[idx[1]], tab[idx[1] + 1], tab[idx[1] + 2], tab[idx[1] + 3],
                        tab[idx[2]], tab[idx[2] + 1], tab[idx[2] + 2], tab[idx[2] + 3],
                        tab[idx[3]], tab[idx[3] + 1], tab[idx[3] + 2], tab[idx[3] + 3] };
    return v_int8x16(vle8_v_i8m1(elems, 16));
}
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    short elems[8] = { tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
                       tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]] };
    return v_int16x8(vle16_v_i16m1(elems, 8));
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    short elems[8] = { tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1],
                       tab[idx[2]], tab[idx[2] + 1], tab[idx[3]], tab[idx[3] + 1] };
    return v_int16x8(vle16_v_i16m1(elems, 8));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    short elems[8] = { tab[idx[0]], tab[idx[0] + 1], tab[idx[0] + 2], tab[idx[0] + 3],
                       tab[idx[1]], tab[idx[1] + 1], tab[idx[1] + 2], tab[idx[1] + 3] };
    return v_int16x8(vle16_v_i16m1(elems, 8));
}
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    int elems[4] = { tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
    return v_int32x4(vle32_v_i32m1(elems, 4));
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    int elems[4] = { tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1] };
    return v_int32x4(vle32_v_i32m1(elems, 4));
}
inline v_int64x2 v_lut(const int64* tab, const int* idx)
{
    int64 elems[2] = { tab[idx[0]], tab[idx[1]] };
    return v_int64x2(vle64_v_i64m1(elems, 2));
}
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int elems[4] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)],
        tab[v_extract_n<2>(idxvec)],
        tab[v_extract_n<3>(idxvec)]
    };
    return v_int32x4(vle32_v_i32m1(elems, 4));
}
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    unsigned elems[4] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)],
        tab[v_extract_n<2>(idxvec)],
        tab[v_extract_n<3>(idxvec)]
    };
    return v_uint32x4(vle32_v_u32m1(elems, 4));
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    float elems[4] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)],
        tab[v_extract_n<2>(idxvec)],
        tab[v_extract_n<3>(idxvec)]
    };
    return v_float32x4(vle32_v_f32m1(elems, 4));
}
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    double elems[2] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)]
    };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}
#endif
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    ushort ptr[16] = {0};
    v_store(ptr, a);
    v_store(ptr + 8, b);
    return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr, 16), 0, 16));
}
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    unsigned ptr[16] = {0};
    v_store(ptr, a);
    v_store(ptr + 4, b);
    v_store(ptr + 8, c);
    v_store(ptr + 12, d);
    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr, 16), 0, 16), 0, 16));
}
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    uint64 ptr[16] = {0};
    v_store(ptr, a);
    v_store(ptr + 2, b);
    v_store(ptr + 4, c);
    v_store(ptr + 6, d);
    v_store(ptr + 8, e);
    v_store(ptr + 10, f);
    v_store(ptr + 12, g);
    v_store(ptr + 14, h);
    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr, 16), 0, 16), 0, 16), 0, 16));
}
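// Arithmetics. Per the universal-intrinsic contract, 8- and 16-bit add/sub
// saturate (vsadd/vssub), while 32/64-bit integer and float ops use the plain
// wrapping/IEEE forms; the *_wrap functions below expose explicitly wrapping
// 8/16-bit arithmetic.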
#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, vl) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a, b, vl)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a = _Tpvec(intrin(a, b, vl)); \
    return a; \
}

OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 2)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 2)
#endif
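// Bitwise logic. Integer types map straight onto vand/vor/vxor/vnot; the float
// types round-trip through same-width integer registers, since RVV has no
// bitwise operations on float vectors.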
#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, vl) \
OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, vl) \
OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, vl) \
OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, vl) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(vnot_v_##suffix##m1(a, vl)); \
}

OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 2)

#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
    return a; \
}

OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a), 4)));
}

#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
    return a; \
}

OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a), 2)));
}
#endif
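// Bit shifts. Unsigned types use logical right shift (vsrl), signed types use
// arithmetic right shift (vsra), matching C semantics of >> for each type.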
#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ \
    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
}

#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ \
    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
}

OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2)
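// Comparisons. The RVV compare intrinsics produce a mask register; vmerge then
// expands the mask into a vector with all-ones lanes where the predicate
// holds, which is the representation the universal intrinsics require. For
// floats the all-ones bit pattern is injected through a type-punned scalar.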
#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
    uint64_t ones = -1; \
    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); \
}

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
    union { uint64 u; double d; } ones; ones.u = -1; \
    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
}

#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, vl)

#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, vl)

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, vl)

OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8, 16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16, 8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32, 4)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64, 2)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8, 16)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16, 8)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32, 4)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64, 2)
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32, 4)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64, 2)
#endif
#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a, b, vl)); \
}

OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 2)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
#endif

OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
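// Reductions. v_reduce_sum uses the widening reduction intrinsics
// (vwredsum/vwredsumu), accumulating into a double-width scalar lane so that
// e.g. summing 16 uchar lanes cannot overflow.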
#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
    _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \
    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
    return (scalartype)(_wTpvec(res).get0()); \
}

OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 8, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 8, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 4, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 4, wredsum)

#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
    _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \
    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
    return (scalartype)(_wTpvec(res).get0()); \
}
#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a, vl)); \
    return scalartype(res.get0()); \
}

OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 16, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 16, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 8, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 8, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 4, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 4, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 4, fredmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 16, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 16, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 8, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 8, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 4, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 4, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax)
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    v_float32x4 one = v_setall_f32(1.0f);
    return one / v_sqrt(x);
}

#if CV_SIMD128_64F
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    v_float64x2 one = v_setall_f64(1.0);
    return one / v_sqrt(x);
}
#endif

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
    return v_sqrt(x);
}
inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
}

#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
    return v_sqrt(x);
}
inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
}
#endif
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vfmacc_vv_f32m1(c, a, b, 4));
}
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmacc_vv_i32m1(c, a, b, 4));
}
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

#if CV_SIMD128_64F
inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmacc_vv_f64m1(c, a, b, 2));
}
inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
#endif
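// v_check_all / v_check_any. The unsigned path shifts the sign bit down and
// ORs the lanes on the scalar side; the signed path uses RVV mask ops directly
// (vmslt + vcpop) to count negative lanes.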
#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    auto v0 = vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl); \
    v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
    return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) == 0; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    auto v0 = vsrl_vx_##suffix##m1(a, shift, vl); \
    v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
    return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) != 0; \
}

OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4)

inline bool v_check_all(const v_uint64x2& a)
{
    v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(vnot_v_u64m1(a, 2), 63, 2));
    return (v.val[0] | v.val[1]) == 0;
}
inline bool v_check_any(const v_uint64x2& a)
{
    v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(a, 63, 2));
    return (v.val[0] | v.val[1]) != 0;
}
#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    return vcpop(vmslt(a, 0, vl), vl) == vl; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    return vcpop(vmslt(a, 0, vl), vl) != 0; \
}

OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8x16, 16)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16x8, 8)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32x4, 4)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64x2, 2)
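// Absolute difference. v_absdiff returns |a - b|; for signed inputs the result
// type is the unsigned counterpart, while v_absdiffs keeps the signed type and
// saturates. v_abs(a) is expressed as v_absdiff(a, 0).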
#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_max(a, b) - v_min(a, b); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
#endif
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)

#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(ivec, uvec, itype, utype, isuf, usuf, vlen) \
inline uvec v_absdiff(const ivec& a, const ivec& b) \
{ \
    itype max = vmax_vv_##isuf(a, b, vlen); \
    itype min = vmin_vv_##isuf(a, b, vlen); \
    return uvec(vreinterpret_v_##isuf##_##usuf(vsub_vv_##isuf(max, min, vlen))); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vint8m1_t, vuint8m1_t, i8m1, u8m1, 16)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vint16m1_t, vuint16m1_t, i16m1, u16m1, 8)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vint32m1_t, vuint32m1_t, i32m1, u32m1, 4)

#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
inline _Tprvec v_abs(const _Tpvec& a) \
{ \
    return v_absdiff(a, v_setzero_##suffix()); \
}

OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8)
OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16)
OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32)
OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32)
#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_reduce_sum(v_absdiff(a, b)); \
}

OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float)
#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, vl) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(merge(ne(mask, 0, vl), b, a, vl)); \
}

OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 16)
OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 16)
OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 8)
OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 8)
OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 4)
OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 4)
OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 4)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 2)
#endif
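// Lane rotations, built from vslideup/vslidedown over a zeroed destination.
// The two-argument forms shift lanes out of a and shift lanes of b in.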
#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return _Tpvec(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64x2, i64, 2)

#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return _Tpvec(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32x4, f32, 4)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64x2, f64, 2)
#endif
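// float64 <-> float32 conversions widen through an m2 (double-length) register.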
// Two equivalent f64->f32 narrowing paths survive here; the original file
// selects between them with a toolchain-dependent preprocessor guard (elided):
// the first goes through a memory buffer, the second stays in registers.
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
}
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
}
// Register-only variants using vset/vlmul instead of a memory round-trip:
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
    return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
}
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
    return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(dst, 1, b), 4));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
    double elems[2] = { ptr[0], ptr[1] };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}
inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
    double elems[2] = { ptr[2], ptr[3] };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}
inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
    double elems[2] = { ptr[0], ptr[1] };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
    double elems[2] = { ptr[2], ptr[3] };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}
#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
{ \
    return v_setall_##suffix(v_extract_n<i>(v)); \
}

OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    _Tp elems0[4] = \
    { \
        v_extract_n<0>(a0), \
        v_extract_n<0>(a1), \
        v_extract_n<0>(a2), \
        v_extract_n<0>(a3) \
    }; \
    b0 = v_load(elems0); \
    _Tp elems1[4] = \
    { \
        v_extract_n<1>(a0), \
        v_extract_n<1>(a1), \
        v_extract_n<1>(a2), \
        v_extract_n<1>(a3) \
    }; \
    b1 = v_load(elems1); \
    _Tp elems2[4] = \
    { \
        v_extract_n<2>(a0), \
        v_extract_n<2>(a1), \
        v_extract_n<2>(a2), \
        v_extract_n<2>(a3) \
    }; \
    b2 = v_load(elems2); \
    _Tp elems3[4] = \
    { \
        v_extract_n<3>(a0), \
        v_extract_n<3>(a1), \
        v_extract_n<3>(a2), \
        v_extract_n<3>(a3) \
    }; \
    b3 = v_load(elems3); \
}

OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \
inline _Tpvec v_reverse(const _Tpvec& a) \
{ \
    _Tp ptr[_Tpvec::nlanes] = {0}; \
    _Tp ptra[_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    for (int i = 0; i < _Tpvec::nlanes; i++) \
    { \
        ptr[i] = ptra[_Tpvec::nlanes-i-1]; \
    } \
    return v_load(ptr); \
}

OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, i8)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, i16)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, i32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, f32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, i64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64)
#endif
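// Widening expansion: store each half of the source register, reload it as an
// mf2 register, and widen with vwcvt into the double-width m1 type.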
#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
    v_store_low(lptr, a); \
    v_store_high(hptr, a); \
    b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
    b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
    v_store_low(lptr, a); \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
    v_store_high(hptr, a); \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr, vl), vl)); \
}

OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1, 4)

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr, 4), 4), 4));
}
inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr, 4), 4), 4));
}
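// Narrowing packs. The two sources are stored side by side into an m2-sized
// buffer, reloaded as one m2 register, and narrowed with a single vnclip
// (saturating) or vnsrl/vnsra (truncating); v_rshr_pack shifts right by n
// before narrowing.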
#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \
inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); \
} \
inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); \
} \
template<int n> inline \
_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)); \
} \
template<int n> inline \
void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    vse##hwidth##_v_##hsuffix##m1(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)), hvl); \
}

OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)

#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \
} \
inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); \
} \
template<int n> inline \
_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl)); \
} \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl))); \
}

OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8)
#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tp ptra0[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptra1[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb0[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb1[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra0, a0); \
    v_store(ptra1, a1); \
    int i; \
    for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \
    { \
        ptrb0[i*2] = ptra0[i]; \
        ptrb0[i*2+1] = ptra1[i]; \
    } \
    for( ; i < v_##_Tpvec::nlanes; i++ ) \
    { \
        ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \
        ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \
    } \
    b0 = v_load(ptrb0); \
    b1 = v_load(ptrb1); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
    v_store_low(ptra, a); \
    v_store_low(ptrb, b); \
    return v_load_halves(ptra, ptrb); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
    v_store_high(ptra, a); \
    v_store_high(ptrb, b); \
    return v_load_halves(ptra, ptrb); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c = v_combine_low(a, b); \
    d = v_combine_high(a, b); \
}

OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, i8)
OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, i16)
OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, i32)
OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64)
#endif
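// Channel de/interleaving and lane shuffles, implemented through scalar
// scratch buffers: store the registers, permute the elements in memory, and
// reload the result.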
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    int i, i2; \
    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
    { \
        ptra[i] = ptr[i2]; \
        ptrb[i] = ptr[i2+1]; \
    } \
    a = v_load(ptra); \
    b = v_load(ptrb); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
    int i, i3; \
    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
    { \
        ptra[i] = ptr[i3]; \
        ptrb[i] = ptr[i3+1]; \
        ptrc[i] = ptr[i3+2]; \
    } \
    a = v_load(ptra); \
    b = v_load(ptrb); \
    c = v_load(ptrc); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
    int i, i4; \
    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
    { \
        ptra[i] = ptr[i4]; \
        ptrb[i] = ptr[i4+1]; \
        ptrc[i] = ptr[i4+2]; \
        ptrd[i] = ptr[i4+3]; \
    } \
    a = v_load(ptra); \
    b = v_load(ptrb); \
    c = v_load(ptrc); \
    d = v_load(ptrd); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    v_store(ptrb, b); \
    int i, i2; \
    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
    { \
        ptr[i2] = ptra[i]; \
        ptr[i2+1] = ptrb[i]; \
    } \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    v_store(ptrb, b); \
    v_store(ptrc, c); \
    int i, i3; \
    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
    { \
        ptr[i3] = ptra[i]; \
        ptr[i3+1] = ptrb[i]; \
        ptr[i3+2] = ptrc[i]; \
    } \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    v_store(ptrb, b); \
    v_store(ptrc, c); \
    v_store(ptrd, d); \
    int i, i4; \
    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
    { \
        ptr[i4] = ptra[i]; \
        ptr[i4+1] = ptrb[i]; \
        ptr[i4+2] = ptrc[i]; \
        ptr[i4+3] = ptrd[i]; \
    } \
} \
inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
{ \
    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptrvec, vec); \
    for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
    { \
        ptr[4*i  ] = ptrvec[4*i  ]; \
        ptr[4*i+1] = ptrvec[4*i+2]; \
        ptr[4*i+2] = ptrvec[4*i+1]; \
        ptr[4*i+3] = ptrvec[4*i+3]; \
    } \
    return v_load(ptr); \
} \
inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
{ \
    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptrvec, vec); \
    for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
    { \
        ptr[8*i  ] = ptrvec[8*i  ]; \
        ptr[8*i+1] = ptrvec[8*i+4]; \
        ptr[8*i+2] = ptrvec[8*i+1]; \
        ptr[8*i+3] = ptrvec[8*i+5]; \
        ptr[8*i+4] = ptrvec[8*i+2]; \
        ptr[8*i+5] = ptrvec[8*i+6]; \
        ptr[8*i+6] = ptrvec[8*i+3]; \
        ptr[8*i+7] = ptrvec[8*i+7]; \
    } \
    return v_load(ptr); \
}

OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double)
#endif
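// Per-lane popcount via an 8-bit lookup table: the vector is reinterpreted as
// bytes, each byte's population count is looked up, and the counts are summed
// back into lanes of the original width.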
static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
inline _rTpvec v_popcount(const _Tpvec& a) \
{ \
    uchar ptra[16] = {0}; \
    v_store(ptra, v_reinterpret_as_u8(a)); \
    _rTp ptr[_Tpvec::nlanes] = {0}; \
    v_store(ptr, v_setzero_##suffix()); \
    for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
        ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
    return v_load(ptr); \
}

OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
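// The table maps a byte to its number of set bits, and the loop above sums the
// per-byte counts that fall inside each lane. Illustrative check for one
// v_uint32x4 lane holding 0x0F0F0001:
//   popCountTable[0x01] + popCountTable[0x00] + popCountTable[0x0F] + popCountTable[0x0F]
//   == 1 + 0 + 4 + 4 == 9 set bits.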
#ifndef __clang__
#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, vl, shift) \
inline int v_signmask(const _Tpvec& a) \
{ \
    int mask = 0; \
    _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift, vl)); \
    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
        mask |= (int)(tmp.val[i]) << i; \
    return mask; \
}

OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 16, 7)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 8, 15)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 4, 31)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 2, 63)
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#endif
#else
#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, width, vl) \
inline int v_signmask(const _Tpvec& a) \
{ \
    uint8_t ans[16] = {0}; \
    vsm(ans, vmslt(a, 0, vl), vl); \
    return reinterpret_cast<int*>(ans)[0] & ((1 << (vl)) - 1); \
}
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8x16, 8, 16)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16x8, 16, 8)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32x4, 32, 4)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64x2, 64, 2)
inline int v_signmask(const v_uint8x16& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }
inline int v_signmask(const v_uint16x8& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }
inline int v_signmask(const v_uint32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_uint64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
#endif
#endif // __clang__
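// Usage sketch (illustrative values): each lane contributes its sign bit to one
// bit of the result, least-significant lane first, e.g.
//   v_int32x4 v(-1, 2, -3, 4);
//   v_signmask(v); // lanes 0 and 2 are negative -> 0b0101 == 5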
#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
inline int v_scan_forward(const _Tpvec& a) \
{ \
    _Tp ptr[_Tpvec::nlanes] = {0}; \
    v_store(ptr, v_reinterpret_as_##suffix(a)); \
    for (int i = 0; i < _Tpvec::nlanes; i++) \
        if(int(ptr[i]) < 0) \
            return i; \
    return 0; \
}
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
#endif
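// Note: v_scan_forward returns the index of the first lane whose value is
// negative (sign bit set after the reinterpret), and 0 when none is; callers
// are expected to pair it with v_check_any(). Illustrative:
//   v_int32x4 v(5, -1, 7, -3);
//   v_scan_forward(v); // -> 1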
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    const uint64 ptr[2] = {0x0908060504020100, 0xFFFFFF0F0E0D0C0A};
    const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
    return v_reinterpret_as_s8(v_uint8x16(vrgather_vv_u8m1(
        v_reinterpret_as_u8(vec),
        v_reinterpret_as_u8(flags),
        16)));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{
    return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
}
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    const uint64 ptr[2] = {0x0908050403020100, 0xFFFF0F0E0D0C0B0A};
    const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
    return v_reinterpret_as_s16(v_uint8x16(vrgather_vv_u8m1(
        v_reinterpret_as_u8(vec),
        v_reinterpret_as_u8(flags),
        16)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{
    return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
}

inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
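// About the index tables above (assuming the vrgather-based reconstruction):
// read as 16 little-endian bytes, {0x0908060504020100, 0xFFFFFF0F0E0D0C0A} is
// {0,1,2, 4,5,6, 8,9,10, 12,13,14, 0xFF,0xFF,0xFF,0xFF}, i.e. every fourth
// lane is skipped so four 3-element triplets are packed contiguously; the
// out-of-range 0xFF indices make vrgather write zeros into the unused tail.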
#if CV_FP16
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
    return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4));
}

inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
    vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
}
#else
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
    const int N = 4;
    float buf[N];
    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
    return v_load(buf);
}

inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
    const int N = 4;
    float buf[N];
    v_store(buf, v);
    for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
}
#endif
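// Round-trip sketch (illustrative): both paths behave the same up to
// half-precision rounding --
//   hfloat h[4] = { hfloat(0.5f), hfloat(1.5f), hfloat(-2.f), hfloat(8.f) };
//   v_float32x4 f = v_load_expand(h); // widen 4 halves to f32
//   v_pack_store(h, f);               // narrow back; h is unchanged here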
inline v_int32x4 v_round(const v_float32x4& a)
{
    return v_int32x4(vfcvt_x_f_v_i32m1(a, 4));
}

inline v_int32x4 v_floor(const v_float32x4& a)
{
    v_float32x4 t = a - v_setall_f32(0.5f);
    return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    v_float32x4 t = a + v_setall_f32(0.5f);
    return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
}
inline v_int32x4 v_trunc(const v_float32x4& a)
{
#ifndef CV_RVV_THEAD_0_7
    return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4));
#else
    const int old_round = fegetround();
    fesetround(FE_TOWARDZERO);
    vint32m1_t val = vfcvt_x_f_v_i32m1(a, 4);
    fesetround(old_round);
    return v_int32x4(val);
#endif
}
#if CV_SIMD128_64F
#ifndef __clang__
inline v_int32x4 v_round(const v_float64x2& a)
{
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
}

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
#ifndef CV_RVV_THEAD_0_7
    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4));
#else
    const int old_round = fegetround();
    fesetround(FE_TOWARDZERO);
    vint32m1_t val = vfncvt_x_f_w_i32m1(tmp, 4);
    fesetround(old_round);
    return v_int32x4(val);
#endif
}
#else
inline v_int32x4 v_round(const v_float64x2& a)
{
    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
}

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
    return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(dst, 1, b), 4));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
    dst = vset_v_f64m1_f64m2(dst, 0, a);
    dst = vfsub_vf_f64m2(dst, 0.5, 2);
    return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
    dst = vset_v_f64m1_f64m2(dst, 0, a);
    dst = vfadd_vf_f64m2(dst, 0.5, 2);
    return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
}
#endif // __clang__
#endif // CV_SIMD128_64F
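// Note on the f64 paths above: the RVV narrowing converts (vfncvt_*_w_*) need an
// LMUL=2 source, so the two double lanes are first placed in a vfloat64m2_t with
// the upper lanes zeroed; result lanes 2 and 3 are therefore 0. Illustrative:
//   v_round(v_float64x2(2.7, -1.2)); // -> {3, -1, 0, 0} (round to nearest)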
// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int ptr[8] = {0};
    v_int32x4 t1, t2;
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2;
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int ptr[8] = {0};
    v_int32x4 t1, t2;
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2 + c;
}

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    int64 ptr[4] = {0};
    v_int64x2 t1, t2;
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2;
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int64 ptr[4] = {0};
    v_int64x2 t1, t2;
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2 + c;
}
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    unsigned ptr[16] = {0};
    v_uint32x4 t1, t2, t3, t4;
    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    unsigned ptr[16] = {0};
    v_uint32x4 t1, t2, t3, t4;
    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    int ptr[16] = {0};
    v_int32x4 t1, t2, t3, t4;
    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    int ptr[16] = {0};
    v_int32x4 t1, t2, t3, t4;
    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    uint64 ptr[8] = {0};
    v_uint64x2 t1, t2, t3, t4;
    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    uint64 ptr[8] = {0};
    v_uint64x2 t1, t2, t3, t4;
    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    int64 ptr[8] = {0};
    v_int64x2 t1, t2, t3, t4;
    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
    int64 ptr[8] = {0};
    v_int64x2 t1, t2, t3, t4;
    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}
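// How the deinterleave trick implements v_dotprod (illustrative): the widening
// multiply leaves products p0..p7 in lane order; v_load_deinterleave splits them
// into even/odd streams t1 = {p0,p2,p4,p6} and t2 = {p1,p3,p5,p7}, so t1 + t2 is
// the pairwise sum, e.g. for a = {1,2,3,4,5,6,7,8} and b = all-ones:
//   v_dotprod(a, b); // -> {1+2, 3+4, 5+6, 7+8} == {3, 7, 11, 15}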
// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
    int ptr[8] = {0};
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    return t1 + t2;
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int ptr[8] = {0};
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    return t1 + t2 + c;
}

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
    int64 ptr[4] = {0};
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    return t1 + t2;
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int64 ptr[4] = {0};
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    return t1 + t2 + c;
}
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
    unsigned ptr[16] = {0};
    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
    v_uint32x4 t1 = v_load(ptr);
    v_uint32x4 t2 = v_load(ptr+4);
    v_uint32x4 t3 = v_load(ptr+8);
    v_uint32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4;
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    unsigned ptr[16] = {0};
    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
    v_uint32x4 t1 = v_load(ptr);
    v_uint32x4 t2 = v_load(ptr+4);
    v_uint32x4 t3 = v_load(ptr+8);
    v_uint32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4 + c;
}

inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
    int ptr[16] = {0};
    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    v_int32x4 t3 = v_load(ptr+8);
    v_int32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4;
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    int ptr[16] = {0};
    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    v_int32x4 t3 = v_load(ptr+8);
    v_int32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4 + c;
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    uint64 ptr[8] = {0};
    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
    v_uint64x2 t1 = v_load(ptr);
    v_uint64x2 t2 = v_load(ptr+2);
    v_uint64x2 t3 = v_load(ptr+4);
    v_uint64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4;
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    uint64 ptr[8] = {0};
    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
    v_uint64x2 t1 = v_load(ptr);
    v_uint64x2 t2 = v_load(ptr+2);
    v_uint64x2 t3 = v_load(ptr+4);
    v_uint64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4 + c;
}

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    int64 ptr[8] = {0};
    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    v_int64x2 t3 = v_load(ptr+4);
    v_int64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
    int64 ptr[8] = {0};
    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    v_int64x2 t3 = v_load(ptr+4);
    v_int64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4 + c;
}
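// The _fast variants add the low and high halves of the product vector instead
// of deinterleaving, so products land in different output lanes than v_dotprod;
// that is allowed because "fast" consumers only reduce the result, e.g.
//   v_reduce_sum(v_dotprod_fast(a, b)) == v_reduce_sum(v_dotprod(a, b))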
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3, 4);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
    return v_float32x4(vfadd_vv_f32m1(res, a, 4));
}
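// v_matmul treats m0..m3 as matrix columns and forms the linear combination
// v0*m0 + v1*m1 + v2*m2 + v3*m3 with one vfmul plus three fused multiply-adds;
// v_matmuladd drops the m3 term and adds the bias vector a instead.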
#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) \
inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
{ \
    _Tpw ptr[_Tpwvec::nlanes*2] = {0}; \
    vse##width##_v_##suffix##m2(ptr, wmul(a, b, vl), vl); \
    c = _Tpwvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
    d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes, hvl)); \
}
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16, 16, 8)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16, 16, 8)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32, 8, 4)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32, 8, 4)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64, 4, 2)
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b, 8), 16, 8));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b, 8), 16, 8));
}
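// Illustrative check: v_mul_hi keeps the high 16 bits of each 32-bit product,
// e.g. for ushort lanes a = 0x8000 and b = 0x0002 the product is 0x00010000,
// so the result lane is 0x0001.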
#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _wTpvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ \
    a = a * b; \
    return a; \
}

OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)
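// Note: 8- and 16-bit multiplication saturates because v_pack narrows with
// saturation, e.g. for uchar lanes 200 * 2 the widened product is 400 and the
// packed result is 255.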
inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

#endif // OPENCV_HAL_INTRIN_RVV_HPP