#ifndef OPENCV_HAL_INTRIN_RVV_HPP
#define OPENCV_HAL_INTRIN_RVV_HPP

// The RVV intrinsics were renamed in spec version 0.11, so pull in the v0.10
// compatibility headers when a newer intrinsic version is detected.
#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>10999
#include "intrin_rvv_010_compat_non-policy.hpp"
#include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
#endif

// T-Head toolchains implement the pre-ratification RVV 0.7 specification.
#ifdef __THEAD_VERSION__
# include <fenv.h>  // fegetround/fesetround for the RVV 0.7 rounding fallback below
# define CV_RVV_THEAD_0_7
#endif

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128 1
// 64-bit float SIMD is not available on the RVV 0.7 (T-Head) toolchain.
#ifndef CV_RVV_THEAD_0_7
# define CV_SIMD128_64F 1
#else
# define CV_SIMD128_64F 0
#endif
// Fractional-LMUL (mf2/mf4) register types emulated with plain arrays, for
// toolchains that lack the native types.
struct vuint8mf2_t
{
    uchar val[8] = {0};
    vuint8mf2_t() {}
    vuint8mf2_t(const uchar* ptr) { for (int i = 0; i < 8; ++i) val[i] = ptr[i]; }
};
struct vint8mf2_t
{
    schar val[8] = {0};
    vint8mf2_t() {}
    vint8mf2_t(const schar* ptr) { for (int i = 0; i < 8; ++i) val[i] = ptr[i]; }
};
struct vuint16mf2_t
{
    ushort val[4] = {0};
    vuint16mf2_t() {}
    vuint16mf2_t(const ushort* ptr) { for (int i = 0; i < 4; ++i) val[i] = ptr[i]; }
};
struct vint16mf2_t
{
    short val[4] = {0};
    vint16mf2_t() {}
    vint16mf2_t(const short* ptr) { for (int i = 0; i < 4; ++i) val[i] = ptr[i]; }
};
struct vuint32mf2_t
{
    unsigned val[2] = {0};
    vuint32mf2_t() {}
    vuint32mf2_t(const unsigned* ptr) { val[0] = ptr[0]; val[1] = ptr[1]; }
};
struct vint32mf2_t
{
    int val[2] = {0};
    vint32mf2_t() {}
    vint32mf2_t(const int* ptr) { val[0] = ptr[0]; val[1] = ptr[1]; }
};
struct vfloat32mf2_t
{
    float val[2] = {0};
    vfloat32mf2_t() {}
    vfloat32mf2_t(const float* ptr) { val[0] = ptr[0]; val[1] = ptr[1]; }
};
struct vuint64mf2_t
{
    uint64 val[1] = {0};
    vuint64mf2_t() {}
    vuint64mf2_t(const uint64* ptr) { val[0] = ptr[0]; }
};
struct vint64mf2_t
{
    int64 val[1] = {0};
    vint64mf2_t() {}
    vint64mf2_t(const int64* ptr) { val[0] = ptr[0]; }
};
struct vfloat64mf2_t
{
    double val[1] = {0};
    vfloat64mf2_t() {}
    vfloat64mf2_t(const double* ptr) { val[0] = ptr[0]; }
};
struct vuint8mf4_t
{
    uchar val[4] = {0};
    vuint8mf4_t() {}
    vuint8mf4_t(const uchar* ptr) { for (int i = 0; i < 4; ++i) val[i] = ptr[i]; }
};
struct vint8mf4_t
{
    schar val[4] = {0};
    vint8mf4_t() {}
    vint8mf4_t(const schar* ptr) { for (int i = 0; i < 4; ++i) val[i] = ptr[i]; }
};
#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr, size_t vl) \
{ \
    CV_UNUSED(vl); \
    return _Tpvec(ptr); \
} \
inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v, size_t vl) \
{ \
    CV_UNUSED(vl); \
    for (int i = 0; i < n; ++i) \
    { \
        ptr[i] = v.val[i]; \
    } \
}

OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)
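// Illustrative sketch (not from the original header): the emulated mf2 types
// simply copy through memory, so a load/store round trip behaves like memcpy:
//     uchar buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//     vuint8mf2_t half = vle8_v_u8mf2(buf, 8); // wraps the 8 bytes
//     uchar out[8];
//     vse8_v_u8mf2(out, half, 8);              // writes them back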
#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
inline _Tpwvec wcvt (_Tpvec v, size_t vl) \
{ \
    _wTp tmp[n]; \
    for (int i = 0; i < n; ++i) \
    { \
        tmp[i] = (_wTp)v.val[i]; \
    } \
    return vle##width##_v_##suffix##m1(tmp, vl); \
}

OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)
inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base, size_t vl)
{
    CV_UNUSED(vl);
    return vuint8mf4_t(base);
}
inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base, size_t vl)
{
    CV_UNUSED(vl);
    return vint8mf4_t(base);
}

inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src, size_t vl)
{
    ushort tmp[4];
    for (int i = 0; i < 4; ++i)
    {
        tmp[i] = (ushort)src.val[i];
    }
    return vle16_v_u16mf2(tmp, vl);
}
inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src, size_t vl)
{
    short tmp[4];
    for (int i = 0; i < 4; ++i)
    {
        tmp[i] = (short)src.val[i];
    }
    return vle16_v_i16mf2(tmp, vl);
}
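// Illustrative sketch (not from the original header): these helpers chain to
// widen four u8 lanes to u16, e.g. for quarter-register expanding loads:
//     uchar q[4] = {10, 20, 30, 40};
//     vuint16mf2_t w = vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(q, 4), 4);
//     // w now holds {10, 20, 30, 40} as 16-bit lanes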
////////////// Types //////////////

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(vuint8m1_t v) { vse8_v_u8m1(val, v, nlanes); }
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vuint8m1_t() const { return vle8_v_u8m1(val, nlanes); }
    uchar get0() const { return val[0]; }

    uchar val[16];
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(vint8m1_t v) { vse8_v_i8m1(val, v, nlanes); }
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vint8m1_t() const { return vle8_v_i8m1(val, nlanes); }
    schar get0() const { return val[0]; }

    schar val[16];
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(vuint16m1_t v) { vse16_v_u16m1(val, v, nlanes); }
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vuint16m1_t() const { return vle16_v_u16m1(val, nlanes); }
    ushort get0() const { return val[0]; }

    ushort val[8];
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(vint16m1_t v) { vse16_v_i16m1(val, v, nlanes); }
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vint16m1_t() const { return vle16_v_i16m1(val, nlanes); }
    short get0() const { return val[0]; }

    short val[8];
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(vuint32m1_t v) { vse32_v_u32m1(val, v, nlanes); }
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vuint32m1_t() const { return vle32_v_u32m1(val, nlanes); }
    unsigned get0() const { return val[0]; }

    unsigned val[4];
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(vint32m1_t v) { vse32_v_i32m1(val, v, nlanes); }
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vint32m1_t() const { return vle32_v_i32m1(val, nlanes); }
    int get0() const { return val[0]; }

    int val[4];
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(vfloat32m1_t v) { vse32_v_f32m1(val, v, nlanes); }
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vfloat32m1_t() const { return vle32_v_f32m1(val, nlanes); }
    float get0() const { return val[0]; }

    float val[4];
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(vuint64m1_t v) { vse64_v_u64m1(val, v, nlanes); }
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vuint64m1_t() const { return vle64_v_u64m1(val, nlanes); }
    uint64 get0() const { return val[0]; }

    uint64 val[2];
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(vint64m1_t v) { vse64_v_i64m1(val, v, nlanes); }
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vint64m1_t() const { return vle64_v_i64m1(val, nlanes); }
    int64 get0() const { return val[0]; }

    int64 val[2];
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(vfloat64m1_t v) { vse64_v_f64m1(val, v, nlanes); }
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        for (int i = 0; i < nlanes; ++i)
            val[i] = v[i];
    }
    operator vfloat64m1_t() const { return vle64_v_f64m1(val, nlanes); }
    double get0() const { return val[0]; }

    double val[2];
};
// Alternative type definitions that keep the value addressable through a
// native-register pointer (pval); the original selects between this set and
// the one above with a compiler-specific preprocessor guard.
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(vuint8m1_t v) { *pval = v; }
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        *pval = vle8_v_u8m1(v, nlanes);
    }
    operator vuint8m1_t() const { return *pval; }
    uchar get0() const { return vmv_x(*pval); }

    uchar val[16];
    vuint8m1_t* pval = (vuint8m1_t*)val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(vint8m1_t v) { *pval = v; }
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        *pval = vle8_v_i8m1(v, nlanes);
    }
    operator vint8m1_t() const { return *pval; }
    schar get0() const { return vmv_x(*pval); }

    schar val[16];
    vint8m1_t* pval = (vint8m1_t*)val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(vuint16m1_t v) { *pval = v; }
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        *pval = vle16_v_u16m1(v, nlanes);
    }
    operator vuint16m1_t() const { return *pval; }
    ushort get0() const { return vmv_x(*pval); }

    ushort val[8];
    vuint16m1_t* pval = (vuint16m1_t*)val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(vint16m1_t v) { *pval = v; }
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        *pval = vle16_v_i16m1(v, nlanes);
    }
    operator vint16m1_t() const { return *pval; }
    short get0() const { return vmv_x(*pval); }

    short val[8];
    vint16m1_t* pval = (vint16m1_t*)val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(vuint32m1_t v) { *pval = v; }
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        *pval = vle32_v_u32m1(v, nlanes);
    }
    operator vuint32m1_t() const { return *pval; }
    unsigned get0() const { return vmv_x(*pval); }

    unsigned val[4];
    vuint32m1_t* pval = (vuint32m1_t*)val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(vint32m1_t v) { *pval = v; }
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        *pval = vle32_v_i32m1(v, nlanes);
    }
    operator vint32m1_t() const { return *pval; }
    int get0() const { return vmv_x(*pval); }

    int val[4];
    vint32m1_t* pval = (vint32m1_t*)val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(vfloat32m1_t v) { *pval = v; }
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        *pval = vle32_v_f32m1(v, nlanes);
    }
    operator vfloat32m1_t() const { return *pval; }
    float get0() const { return vfmv_f(*pval); }

    float val[4];
    vfloat32m1_t* pval = (vfloat32m1_t*)val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(vuint64m1_t v) { *pval = v; }
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        *pval = vle64_v_u64m1(v, nlanes);
    }
    operator vuint64m1_t() const { return *pval; }
    uint64 get0() const { return vmv_x(*pval); }

    uint64 val[2];
    vuint64m1_t* pval = (vuint64m1_t*)val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(vint64m1_t v) { *pval = v; }
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        *pval = vle64_v_i64m1(v, nlanes);
    }
    operator vint64m1_t() const { return *pval; }
    int64 get0() const { return vmv_x(*pval); }

    int64 val[2];
    vint64m1_t* pval = (vint64m1_t*)val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(vfloat64m1_t v) { *pval = v; }
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        *pval = vle64_v_f64m1(v, nlanes);
    }
    operator vfloat64m1_t() const { return *pval; }
    double get0() const { return vfmv_f(*pval); }

    double val[2];
    vfloat64m1_t* pval = (vfloat64m1_t*)val;
};
////////////// Initial //////////////

#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
inline v_##_Tpvec v_setzero_##suffix1() \
{ \
    return v_##_Tpvec(vmv_v_x_##suffix2##m1(0, vl)); \
} \
inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
{ \
    return v_##_Tpvec(vmv_v_x_##suffix2##m1(v, vl)); \
}

OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, u8, u8, 16)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, s8, i8, 16)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, u16, u16, 8)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, s16, i16, 8)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, u32, u32, 4)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, s32, i32, 4)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, u64, u64, 2)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, s64, i64, 2)
#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
inline v_##_Tpv v_setzero_##suffix() \
{ \
    return v_##_Tpv(vfmv_v_f_##suffix##m1(0, vl)); \
} \
inline v_##_Tpv v_setall_##suffix(_Tp v) \
{ \
    return v_##_Tpv(vfmv_v_f_##suffix##m1(v, vl)); \
}

OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, f32, 4)
OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, f64, 2)
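// Illustrative sketch (not from the original header): broadcast initializers
// expand to a single vmv.v.x / vfmv.v.f instruction:
//     v_float32x4 zeros = v_setzero_f32();     // {0, 0, 0, 0}
//     v_float32x4 pi    = v_setall_f32(3.14f); // {3.14, 3.14, 3.14, 3.14}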
////////////// Reinterpret //////////////

#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }

OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
}

OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, int8x16, u8, s8, u8, i8)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, int16x8, u16, s16, u16, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, int32x4, u32, s32, u32, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, float32x4, u32, f32, u32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, float32x4, s32, f32, i32, f32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, int64x2, u64, s64, u64, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, float64x2, u64, f64, u64, f64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64x2, float64x2, s64, f64, i64, f64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint16x8, u8, u16, u8, u16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint32x4, u8, u32, u8, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint64x2, u8, u64, u8, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint32x4, u16, u32, u16, u32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint64x2, u16, u64, u16, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, uint64x2, u32, u64, u32, u64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int16x8, s8, s16, i8, i16)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int32x4, s8, s32, i8, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int64x2, s8, s64, i8, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int32x4, s16, s32, i16, i32)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int64x2, s16, s64, i16, i64)
OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, int64x2, s32, s64, i32, i64)
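// Illustrative sketch (not from the original header): reinterpret casts are
// bit-pattern preserving, e.g. viewing a float vector through its u32 bits:
//     v_float32x4 f = v_setall_f32(1.0f);
//     v_uint32x4 bits = v_reinterpret_as_u32(f); // each lane = 0x3F800000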
// Reinterpret across both signedness and element width: two chained
// vreinterpret casts, first over the width, then over the signedness.
#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    return v_##_Tpvec1(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v)));\
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v)));\
}

OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int16x8, u8, s16, u, i, 8, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int32x4, u8, s32, u, i, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int64x2, u8, s64, u, i, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int8x16, u16, s8, u, i, 16, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int32x4, u16, s32, u, i, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int64x2, u16, s64, u, i, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int8x16, u32, s8, u, i, 32, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int16x8, u32, s16, u, i, 32, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int64x2, u32, s64, u, i, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int8x16, u64, s8, u, i, 64, 8)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int16x8, u64, s16, u, i, 64, 16)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int32x4, u64, s32, u, i, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float32x4, u8, f32, u, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float32x4, u16, f32, u, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, float32x4, u64, f32, u, f, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float32x4, s8, f32, i, f, 8, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float32x4, s16, f32, i, f, 16, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64x2, float32x4, s64, f32, i, f, 64, 32)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float64x2, u8, f64, u, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float64x2, u16, f64, u, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, float64x2, u32, f64, u, f, 32, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float64x2, s8, f64, i, f, 8, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float64x2, s16, f64, i, f, 16, 64)
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32x4, float64x2, s32, f64, i, f, 32, 64)

inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v)
{
    return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v))));
}
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v)
{
    return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v))));
}
////////////// Extract //////////////

#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vmv, vl) \
template <int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
} \
template<int i> inline _Tp v_extract_n(_Tpvec v) \
{ \
    return _Tp(vmv(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), v, i, vl))); \
}

OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8x16, uchar, u8, vmv_x_s_u8m1_u8, 16)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8x16, schar, i8, vmv_x_s_i8m1_i8, 16)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16x8, ushort, u16, vmv_x_s_u16m1_u16, 8)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16x8, short, i16, vmv_x_s_i16m1_i16, 8)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32x4, uint, u32, vmv_x_s_u32m1_u32, 4)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32x4, int, i32, vmv_x_s_i32m1_i32, 4)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64x2, uint64, u64, vmv_x_s_u64m1_u64, 2)
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64x2, int64, i64, vmv_x_s_i64m1_i64, 2)

#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vmv, vl) \
template <int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
} \
template<int i> inline _Tp v_extract_n(_Tpvec v) \
{ \
    return _Tp(vmv(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), v, i, vl))); \
}

OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32x4, float, f32, vfmv_f_s_f32m1_f32, 4)
OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64x2, double, f64, vfmv_f_s_f64m1_f64, 2)
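// Illustrative sketch (not from the original header): v_extract<s> slides a
// down by s lanes and fills the tail from b; v_extract_n<i> reads lane i:
//     v_int32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
//     v_int32x4 c = v_extract<1>(a, b); // {1, 2, 3, 4}
//     int lane2 = v_extract_n<2>(a);    // 2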
////////////// Load/Store //////////////

#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
    return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
    return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
    _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
    return res; \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, vl); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, a, hvl); \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    vse##width##_v_##suffix##m1(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
}

OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 16, 8, u8, vmv_v_x_u8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 16, 8, i8, vmv_v_x_i8m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 8, 16, u16, vmv_v_x_u16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 8, 16, i16, vmv_v_x_i16m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 4, 32, u32, vmv_v_x_u32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 4, 32, i32, vmv_v_x_i32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 2, 64, u64, vmv_v_x_u64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 2, 64, i64, vmv_v_x_i64m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 4, 32, f32, vfmv_v_f_f32m1)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f64, vfmv_v_f_f64m1)
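// Illustrative sketch (not from the original header): all variants use
// unit-stride vle/vse; the "aligned" forms are aliases because RVV loads do
// not require alignment:
//     int buf[4] = {1, 2, 3, 4};
//     v_int32x4 v = v_load(buf);
//     int lo[2];
//     v_store_low(lo, v); // writes {1, 2}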
inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
{
    schar elems[16] =
    {
        ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
        ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
    };
    return v_int8x16(vle8_v_i8m1(elems, 16));
}
inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
{
    short elems[8] =
    {
        ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
    };
    return v_int16x8(vle16_v_i16m1(elems, 8));
}
inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
{
    int elems[4] =
    {
        ptr0[0], ptr0[1], ptr1[0], ptr1[1]
    };
    return v_int32x4(vle32_v_i32m1(elems, 4));
}
inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
{
    float elems[4] =
    {
        ptr0[0], ptr0[1], ptr1[0], ptr1[1]
    };
    return v_float32x4(vle32_v_f32m1(elems, 4));
}
inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
{
    int64 elems[2] = { ptr0[0], ptr1[0] };
    return v_int64x2(vle64_v_i64m1(elems, 2));
}

////////////// Lookup table access //////////////

// Gather bodies written as equivalent loops; the original spells out each
// tab[idx[...]] element explicitly.
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    schar elems[16]; for (int i = 0; i < 16; ++i) elems[i] = tab[idx[i]];
    return v_int8x16(vle8_v_i8m1(elems, 16));
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    schar elems[16]; for (int i = 0; i < 16; ++i) elems[i] = tab[idx[i/2] + (i & 1)];
    return v_int8x16(vle8_v_i8m1(elems, 16));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    schar elems[16]; for (int i = 0; i < 16; ++i) elems[i] = tab[idx[i/4] + (i & 3)];
    return v_int8x16(vle8_v_i8m1(elems, 16));
}
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    short elems[8]; for (int i = 0; i < 8; ++i) elems[i] = tab[idx[i]];
    return v_int16x8(vle16_v_i16m1(elems, 8));
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    short elems[8]; for (int i = 0; i < 8; ++i) elems[i] = tab[idx[i/2] + (i & 1)];
    return v_int16x8(vle16_v_i16m1(elems, 8));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    short elems[8]; for (int i = 0; i < 8; ++i) elems[i] = tab[idx[i/4] + (i & 3)];
    return v_int16x8(vle16_v_i16m1(elems, 8));
}
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    int elems[4]; for (int i = 0; i < 4; ++i) elems[i] = tab[idx[i]];
    return v_int32x4(vle32_v_i32m1(elems, 4));
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    int elems[4]; for (int i = 0; i < 4; ++i) elems[i] = tab[idx[i/2] + (i & 1)];
    return v_int32x4(vle32_v_i32m1(elems, 4));
}
inline v_int64x2 v_lut(const int64* tab, const int* idx)
{
    int64 elems[2] = { tab[idx[0]], tab[idx[1]] };
    return v_int64x2(vle64_v_i64m1(elems, 2));
}
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int elems[4] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)],
        tab[v_extract_n<2>(idxvec)],
        tab[v_extract_n<3>(idxvec)]
    };
    return v_int32x4(vle32_v_i32m1(elems, 4));
}
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    unsigned elems[4] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)],
        tab[v_extract_n<2>(idxvec)],
        tab[v_extract_n<3>(idxvec)]
    };
    return v_uint32x4(vle32_v_u32m1(elems, 4));
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    float elems[4] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)],
        tab[v_extract_n<2>(idxvec)],
        tab[v_extract_n<3>(idxvec)]
    };
    return v_float32x4(vle32_v_f32m1(elems, 4));
}
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    double elems[2] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)]
    };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}
////////////// Pack boolean //////////////

inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    ushort ptr[16] = {0};
    v_store(ptr, a);
    v_store(ptr + 8, b);
    return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr, 16), 0, 16));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    unsigned ptr[16] = {0};
    v_store(ptr, a);
    v_store(ptr + 4, b);
    v_store(ptr + 8, c);
    v_store(ptr + 12, d);
    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr, 16), 0, 16), 0, 16));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    uint64 ptr[16] = {0};
    v_store(ptr, a);
    v_store(ptr + 2, b);
    v_store(ptr + 4, c);
    v_store(ptr + 6, d);
    v_store(ptr + 8, e);
    v_store(ptr + 10, f);
    v_store(ptr + 12, g);
    v_store(ptr + 14, h);
    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr, 16), 0, 16), 0, 16), 0, 16));
}
////////////// Arithmetics //////////////

#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, vl) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a, b, vl)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a = _Tpvec(intrin(a, b, vl)); \
    return a; \
}

OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 2)
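// Illustrative sketch (not from the original header): 8/16-bit operator+/-
// map to the saturating vsadd/vssub forms, so overflow clamps rather than
// wraps:
//     v_uint8x16 a = v_setall_u8(250), b = v_setall_u8(10);
//     v_uint8x16 c = a + b; // every lane saturates to 255, not 4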
////////////// Bitwise logic //////////////

#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, vl) \
OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, vl) \
OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, vl) \
OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, vl) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(vnot_v_##suffix##m1(a, vl)); \
}

OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 2)

// Float bit operations go through an integer reinterpret.
#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
    return a; \
}

OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a), 4)));
}

#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
    return a; \
}

OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a), 2)));
}
////////////// Bitwise shifts //////////////

#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ \
    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
}

#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ \
    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
}

OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2)
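// Illustrative sketch (not from the original header): >> is logical (vsrl)
// for unsigned types and arithmetic (vsra) for signed ones:
//     v_int32x4 v = v_setall_s32(-8);
//     v_int32x4 r = v >> 1;      // {-4, -4, -4, -4}, sign preserved
//     v_int32x4 s = v_shl<2>(v); // {-32, -32, -32, -32}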
////////////// Comparison //////////////

#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
    uint64_t ones = -1; \
    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); \
}

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
    union { uint64 u; double d; } ones; ones.u = -1; \
    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
}

#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, vl)

#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, vl)

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, vl) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, vl)

OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8, 16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16, 8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32, 4)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64, 2)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8, 16)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16, 8)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32, 4)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64, 2)
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32, 4)
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64, 2)
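// Illustrative sketch (not from the original header): comparisons return a
// vector with all bits set in lanes where the predicate holds:
//     v_int32x4 a(1, 5, 3, 7), b(4, 4, 4, 4);
//     v_int32x4 m = a > b; // {0, -1, 0, -1} viewed as signed lanes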
////////////// Min/Max //////////////

#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a, b, vl)); \
}

OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)

////////////// Arithmetics wrap //////////////

OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
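// Illustrative sketch (not from the original header): the *_wrap forms use
// plain modular vadd/vsub/vmul instead of the saturating default operators:
//     v_uint8x16 a = v_setall_u8(250), b = v_setall_u8(10);
//     v_uint8x16 w = v_add_wrap(a, b); // every lane wraps to 4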
////////////// Reduce //////////////

#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
    _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \
    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
    return (scalartype)(_wTpvec(res).get0()); \
}

OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 8, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 8, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 4, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 4, wredsum)
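// Illustrative sketch (not from the original header): v_reduce_sum widens
// while accumulating (vwredsum), so a full u8 vector cannot overflow:
//     v_uint8x16 a = v_setall_u8(255);
//     unsigned s = v_reduce_sum(a); // 16 * 255 = 4080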
#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
    _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \
    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
    return (scalartype)(_wTpvec(res).get0()); \
}

#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a, vl)); \
    return scalartype(res.get0()); \
}
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 8, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 4, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 4, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 4, fredmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 8, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 4, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 4, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax)
////////////// Square root, magnitude, multiply-add //////////////

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    v_float32x4 one = v_setall_f32(1.0f);
    return one / v_sqrt(x);
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    v_float64x2 one = v_setall_f64(1.0);
    return one / v_sqrt(x);
}

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
}

inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmacc_vv_i32m1(c, a, b, 4));
}
inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vfmacc_vv_f32m1(c, a, b, 4));
}
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}
inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmacc_vv_f64m1(c, a, b, 2));
}
inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
////////////// Check all/any //////////////

#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    auto v0 = vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl); \
    v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
    return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) == 0; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    auto v0 = vsrl_vx_##suffix##m1(a, shift, vl); \
    v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
    return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) != 0; \
}

OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4)

inline bool v_check_all(const v_uint64x2& a)
{
    v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(vnot_v_u64m1(a, 2), 63, 2));
    return (v.val[0] | v.val[1]) == 0;
}
inline bool v_check_any(const v_uint64x2& a)
{
    v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(a, 63, 2));
    return (v.val[0] | v.val[1]) != 0;
}
// Mask-population variant of the same checks for signed types (the original
// selects between the two definitions with a preprocessor guard).
#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    return vcpop(vmslt(a, 0, vl), vl) == vl; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    return vcpop(vmslt(a, 0, vl), vl) != 0; \
}

OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8x16, 16)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16x8, 8)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32x4, 4)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64x2, 2)
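// Illustrative sketch (not from the original header): comparison masks set
// the sign bit of "true" lanes, so vmslt(a, 0) isolates them and vcpop
// counts them:
//     v_int32x4 m = v_setall_s32(-1);          // all lanes true
//     bool all = v_check_all(m);               // true: popcount == 4
//     bool any = v_check_any(v_setzero_s32()); // false: popcount == 0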
////////////// absdiff //////////////

#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_max(a, b) - v_min(a, b); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)

#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(ivec, uvec, itype, utype, isuf, usuf, vlen) \
inline uvec v_absdiff(const ivec& a, const ivec& b) \
{ \
    itype max = vmax_vv_##isuf(a, b, vlen); \
    itype min = vmin_vv_##isuf(a, b, vlen); \
    return uvec(vreinterpret_v_##isuf##_##usuf(vsub_vv_##isuf(max, min, vlen))); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vint8m1_t, vuint8m1_t, i8m1, u8m1, 16)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vint16m1_t, vuint16m1_t, i16m1, u16m1, 8)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vint32m1_t, vuint32m1_t, i32m1, u32m1, 4)

#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
inline _Tprvec v_abs(const _Tpvec& a) \
{ \
    return v_absdiff(a, v_setzero_##suffix()); \
}
#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_reduce_sum(v_absdiff(a, b)); \
}

OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
////////////// Select //////////////

#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, vl) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(merge(ne(mask, 0, vl), b, a, vl)); \
}

OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 16)
OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 16)
OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 8)
OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 8)
OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 4)
OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 4)
OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 4)
OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 2)
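// Illustrative sketch (not from the original header): v_select picks a where
// the mask lane is non-zero and b elsewhere:
//     v_int32x4 m(-1, 0, -1, 0), a(1, 1, 1, 1), b(9, 9, 9, 9);
//     v_int32x4 r = v_select(m, a, b); // {1, 9, 1, 9}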
////////////// Rotate shift //////////////

#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return _Tpvec(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }

OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64x2, i64, 2)
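// Illustrative sketch (not from the original header): single-operand
// "rotates" are lane shifts built from vslideup/vslidedown with zero fill:
//     v_int32x4 a(0, 1, 2, 3);
//     v_int32x4 r = v_rotate_right<1>(a); // {1, 2, 3, 0}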
#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return _Tpvec(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }
////////////// Convert to float //////////////

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
}

// Register-based variants of the same conversions (vset/vlmul_ext instead of
// a memory round trip); the original selects one set with a preprocessor
// guard.
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
    return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
    return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(dst, 1, b), 4));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
    double elems[2] = { ptr[0], ptr[1] };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
    double elems[2] = { ptr[2], ptr[3] };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
    double elems[2] = { ptr[0], ptr[1] };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    double ptr[4] = {0};
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
    double elems[2] = { ptr[2], ptr[3] };
    return v_float64x2(vle64_v_f64m1(elems, 2));
}
////////////// Broadcast //////////////

#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
{ \
    return v_setall_##suffix(v_extract_n<i>(v)); \
}

OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
////////////// Transpose4x4 //////////////

#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    _Tp elems0[4] = \
    { \
        v_extract_n<0>(a0), \
        v_extract_n<0>(a1), \
        v_extract_n<0>(a2), \
        v_extract_n<0>(a3) \
    }; \
    b0 = v_load(elems0); \
    _Tp elems1[4] = \
    { \
        v_extract_n<1>(a0), \
        v_extract_n<1>(a1), \
        v_extract_n<1>(a2), \
        v_extract_n<1>(a3) \
    }; \
    b1 = v_load(elems1); \
    _Tp elems2[4] = \
    { \
        v_extract_n<2>(a0), \
        v_extract_n<2>(a1), \
        v_extract_n<2>(a2), \
        v_extract_n<2>(a3) \
    }; \
    b2 = v_load(elems2); \
    _Tp elems3[4] = \
    { \
        v_extract_n<3>(a0), \
        v_extract_n<3>(a1), \
        v_extract_n<3>(a2), \
        v_extract_n<3>(a3) \
    }; \
    b3 = v_load(elems3); \
}

OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
////////////// Reverse //////////////

#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \
inline _Tpvec v_reverse(const _Tpvec& a) \
{ \
    _Tp ptr[_Tpvec::nlanes] = {0}; \
    _Tp ptra[_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    for (int i = 0; i < _Tpvec::nlanes; i++) \
    { \
        ptr[i] = ptra[_Tpvec::nlanes-i-1]; \
    } \
    return v_load(ptr); \
}

OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, i16)
OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, i32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, f32)
OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64)
//////////// Value reordering ////////////

#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
    v_store_low(lptr, a); \
    v_store_high(hptr, a); \
    b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
    b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
    v_store_low(lptr, a); \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
    v_store_high(hptr, a); \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr, vl), vl)); \
}

OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1, 4)

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr, 4), 4), 4));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr, 4), 4), 4));
}
#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \
inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); \
} \
inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); \
} \
template<int n> inline \
_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)); \
} \
template<int n> inline \
void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    vse##hwidth##_v_##hsuffix##m1(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)), hvl); \
}

OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)
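// Illustrative sketch (not from the original header): v_pack narrows two
// wide vectors into one, saturating through vnclip:
//     v_int16x8 a = v_setall_s16(300), b = v_setall_s16(-300);
//     v_int8x16 p = v_pack(a, b); // first 8 lanes 127, last 8 lanes -128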
#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \
} \
inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); \
} \
template<int n> inline \
_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl)); \
} \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl))); \
}

OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16)
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8)
#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tp ptra0[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptra1[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb0[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb1[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra0, a0); \
    v_store(ptra1, a1); \
    int i; \
    for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \
    { \
        ptrb0[i*2] = ptra0[i]; \
        ptrb0[i*2+1] = ptra1[i]; \
    } \
    for( ; i < v_##_Tpvec::nlanes; i++ ) \
    { \
        ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \
        ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \
    } \
    b0 = v_load(ptrb0); \
    b1 = v_load(ptrb1); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
    v_store_low(ptra, a); \
    v_store_low(ptrb, b); \
    return v_load_halves(ptra, ptrb); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
    v_store_high(ptra, a); \
    v_store_high(ptrb, b); \
    return v_load_halves(ptra, ptrb); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c = v_combine_low(a, b); \
    d = v_combine_high(a, b); \
}

OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, i8)
OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, i16)
OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, i32)
OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, f32)
OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64)
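// Illustrative sketch (not from the original header): v_zip interleaves two
// vectors lane by lane:
//     v_int32x4 a(0, 1, 2, 3), b(10, 11, 12, 13), lo, hi;
//     v_zip(a, b, lo, hi); // lo = {0, 10, 1, 11}, hi = {2, 12, 3, 13}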
#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    int i, i2; \
    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
    { \
        ptra[i] = ptr[i2]; \
        ptrb[i] = ptr[i2+1]; \
    } \
    a = v_load(ptra); \
    b = v_load(ptrb); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
    int i, i3; \
    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
    { \
        ptra[i] = ptr[i3]; \
        ptrb[i] = ptr[i3+1]; \
        ptrc[i] = ptr[i3+2]; \
    } \
    a = v_load(ptra); \
    b = v_load(ptrb); \
    c = v_load(ptrc); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
    int i, i4; \
    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
    { \
        ptra[i] = ptr[i4]; \
        ptrb[i] = ptr[i4+1]; \
        ptrc[i] = ptr[i4+2]; \
        ptrd[i] = ptr[i4+3]; \
    } \
    a = v_load(ptra); \
    b = v_load(ptrb); \
    c = v_load(ptrc); \
    d = v_load(ptrd); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    v_store(ptrb, b); \
    int i, i2; \
    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
    { \
        ptr[i2] = ptra[i]; \
        ptr[i2+1] = ptrb[i]; \
    } \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    v_store(ptrb, b); \
    v_store(ptrc, c); \
    int i, i3; \
    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
    { \
        ptr[i3] = ptra[i]; \
        ptr[i3+1] = ptrb[i]; \
        ptr[i3+2] = ptrc[i]; \
    } \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra, a); \
    v_store(ptrb, b); \
    v_store(ptrc, c); \
    v_store(ptrd, d); \
    int i, i4; \
    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
    { \
        ptr[i4] = ptra[i]; \
        ptr[i4+1] = ptrb[i]; \
        ptr[i4+2] = ptrc[i]; \
        ptr[i4+3] = ptrd[i]; \
    } \
} \
inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
{ \
    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptrvec, vec); \
    for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
    { \
        ptr[4*i  ] = ptrvec[4*i  ]; \
        ptr[4*i+1] = ptrvec[4*i+2]; \
        ptr[4*i+2] = ptrvec[4*i+1]; \
        ptr[4*i+3] = ptrvec[4*i+3]; \
    } \
    return v_load(ptr); \
} \
inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
{ \
    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptrvec, vec); \
    for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
    { \
        ptr[8*i  ] = ptrvec[8*i  ]; \
        ptr[8*i+1] = ptrvec[8*i+4]; \
        ptr[8*i+2] = ptrvec[8*i+1]; \
        ptr[8*i+3] = ptrvec[8*i+5]; \
        ptr[8*i+4] = ptrvec[8*i+2]; \
        ptr[8*i+5] = ptrvec[8*i+6]; \
        ptr[8*i+6] = ptrvec[8*i+3]; \
        ptr[8*i+7] = ptrvec[8*i+7]; \
    } \
    return v_load(ptr); \
}

OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64)
OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double)
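// Illustrative sketch (not from the original header): deinterleaving splits
// packed channels, e.g. BGRBGR... pixel data into planes:
//     uchar bgr[48]; // 16 BGR pixels
//     v_uint8x16 b, g, r;
//     v_load_deinterleave(bgr, b, g, r); // b = all blues, g = greens, ...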
////////////// Population count //////////////

static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
inline _rTpvec v_popcount(const _Tpvec& a) \
{ \
    uchar ptra[16] = {0}; \
    v_store(ptra, v_reinterpret_as_u8(a)); \
    _rTp ptr[_Tpvec::nlanes] = {0}; \
    v_store(ptr, v_setzero_##suffix()); \
    for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
        ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
    return v_load(ptr); \
}
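// Illustrative sketch (not from the original header): per-lane bit counts
// are accumulated byte by byte through the 256-entry table:
//     v_uint32x4 v = v_setall_u32(0xFF00FF00u);
//     v_uint32x4 c = v_popcount(v); // every lane = 16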
//////////// SignMask ////////////

#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, vl, shift) \
inline int v_signmask(const _Tpvec& a) \
{ \
    int mask = 0; \
    _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift, vl)); \
    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
        mask |= (int)(tmp.val[i]) << i; \
    return mask; \
}

OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 16, 7)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 8, 15)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 4, 31)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 2, 63)

inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
2781#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, width, vl) \
2782inline int v_signmask(const _Tpvec& a) \
2784 uint8_t ans[16] = {0};\
2785 vsm(ans, vmslt(a, 0, vl), vl);\
2786 return reinterpret_cast<int*>(ans)[0] & ((1 << (vl)) - 1);\
2789OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(
v_int8x16, 8, 16)
2790OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(
v_int16x8, 16, 8)
2791OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(
v_int32x4, 32, 4)
2792OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(
v_int64x2, 64, 2)
2795{
return v_signmask(v_reinterpret_as_s8(a)); }
2797{
return v_signmask(v_reinterpret_as_s16(a)); }
2799{
return v_signmask(v_reinterpret_as_s32(a)); }
2801{
return v_signmask(v_reinterpret_as_s32(a)); }
2803{
return v_signmask(v_reinterpret_as_s64(a)); }
2806{
return v_signmask(v_reinterpret_as_s64(a)); }
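// Usage sketch (illustrative only): v_signmask packs each lane's sign bit into
// an int, lane 0 ending up in bit 0.
//
//   short buf[8] = {-1, 2, -3, 4, 5, -6, 7, 8};
//   int m = v_signmask(v_load(buf)); // bits 0, 2 and 5 set -> 0x25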
#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
inline int v_scan_forward(const _Tpvec& a) \
{ \
    _Tp ptr[_Tpvec::nlanes] = {0}; \
    v_store(ptr, v_reinterpret_as_##suffix(a)); \
    for (int i = 0; i < _Tpvec::nlanes; i++) \
        if(int(ptr[i]) < 0) \
            return i; \
    return 0; \
}

OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
#endif
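// Usage sketch (illustrative only): v_scan_forward returns the index of the
// first lane whose sign bit is set, or 0 when no lane qualifies.
//
//   int buf[4] = {1, 2, -3, -4};
//   int first = v_scan_forward(v_load(buf)); // 2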
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    const uint64 ptr[2] = {0x0908060504020100, 0xFFFFFF0F0E0D0C0A};
    const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
    return v_reinterpret_as_s8(v_uint8x16(
        vrgather_vv_u8m1(
            v_reinterpret_as_u8(vec),
            v_reinterpret_as_u8(flags),
            16)));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{
    return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
}

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    const uint64 ptr[2] = {0x0908050403020100, 0xFFFF0F0E0D0C0B0A};
    const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
    return v_reinterpret_as_s16(v_uint8x16(
        vrgather_vv_u8m1(
            v_reinterpret_as_u8(vec),
            v_reinterpret_as_u8(flags),
            16)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{
    return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
}

inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
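// Usage sketch (illustrative only): v_pack_triplets drops every fourth lane,
// e.g. compacting RGBX pixels to RGB in the low lanes (the upper lanes are
// unspecified):
//
//   schar buf[16] = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
//   v_int8x16 t = v_pack_triplets(v_load(buf)); // 0,1,2, 4,5,6, 8,9,10, 12,13,14, ...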
#if CV_FP16
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
    return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2((__fp16*)ptr, 4), 4));
}

inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
    vse16_v_f16mf2((__fp16*)ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
}
#else
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
    const int N = 4;
    float buf[N];
    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
    return v_load(buf);
}

inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
{
    const int N = 4;
    float buf[N];
    v_store(buf, v);
    for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
}
#endif
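// Usage sketch (illustrative only): expand four half-precision values to a
// float32 vector and pack them back.
//
//   hfloat h[4] = { hfloat(1.f), hfloat(2.f), hfloat(3.f), hfloat(4.f) };
//   v_float32x4 f = v_load_expand(h);
//   v_pack_store(h, f); // round-trip through float16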
inline v_int32x4 v_round(const v_float32x4& a)
{
    return v_int32x4(vfcvt_x_f_v_i32m1(a, 4));
}

inline v_int32x4 v_floor(const v_float32x4& a)
{
    v_float32x4 t = a - v_setall_f32(0.5f);
    return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    v_float32x4 t = a + v_setall_f32(0.5f);
    return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{
#ifndef CV_RVV_THEAD_0_7
    return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4));
#else
    const int old_round = fegetround();
    fesetround(FE_TOWARDZERO);
    vint32m1_t val = vfcvt_x_f_v_i32m1(a, 4);
    fesetround(old_round);
    return v_int32x4(val);
#endif
}

#if CV_SIMD128_64F
#ifndef __clang__
inline v_int32x4 v_round(const v_float64x2& a)
{
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
}

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
#ifndef CV_RVV_THEAD_0_7
    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4));
#else
    const int old_round = fegetround();
    fesetround(FE_TOWARDZERO);
    vint32m1_t val = vfncvt_x_f_w_i32m1(tmp, 4);
    fesetround(old_round);
    return v_int32x4(val);
#endif
}

#else //__clang__
inline v_int32x4 v_round(const v_float64x2& a)
{
    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
    return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
}

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
    return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(dst, 1, b), 4));
}

inline v_int32x4 v_floor(const v_float64x2& a)
{
    vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
    dst = vset_v_f64m1_f64m2(dst, 0, a);
    dst = vfsub_vf_f64m2(dst, 0.5, 2);
    return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
    dst = vset_v_f64m1_f64m2(dst, 0, a);
    dst = vfadd_vf_f64m2(dst, 0.5, 2);
    return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
}
#endif
#endif
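// Usage sketch (illustrative only): the four conversions differ in how the
// fractional part is handled (v_round follows the default round-to-nearest-even
// mode).
//
//   float buf[4] = {1.5f, -1.5f, 2.7f, -2.7f};
//   v_float32x4 f = v_load(buf);
//   v_int32x4 r  = v_round(f); //  2, -2,  3, -3
//   v_int32x4 fl = v_floor(f); //  1, -2,  2, -3
//   v_int32x4 ce = v_ceil(f);  //  2, -1,  3, -2
//   v_int32x4 tr = v_trunc(f); //  1, -1,  2, -2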
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int ptr[8] = {0};
    v_int32x4 t1, t2;
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2;
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int ptr[8] = {0};
    v_int32x4 t1, t2;
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2 + c;
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    int64 ptr[4] = {0};
    v_int64x2 t1, t2;
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2;
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int64 ptr[4] = {0};
    v_int64x2 t1, t2;
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2 + c;
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    unsigned ptr[16] = {0};
    v_uint32x4 t1, t2, t3, t4;
    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    unsigned ptr[16] = {0};
    v_uint32x4 t1, t2, t3, t4;
    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    int ptr[16] = {0};
    v_int32x4 t1, t2, t3, t4;
    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    int ptr[16] = {0};
    v_int32x4 t1, t2, t3, t4;
    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    uint64 ptr[8] = {0};
    v_uint64x2 t1, t2, t3, t4;
    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    uint64 ptr[8] = {0};
    v_uint64x2 t1, t2, t3, t4;
    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    int64 ptr[8] = {0};
    v_int64x2 t1, t2, t3, t4;
    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
    int64 ptr[8] = {0};
    v_int64x2 t1, t2, t3, t4;
    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
    int ptr[8] = {0};
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    return t1 + t2;
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int ptr[8] = {0};
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    return t1 + t2 + c;
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
    int64 ptr[4] = {0};
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    return t1 + t2;
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int64 ptr[4] = {0};
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    return t1 + t2 + c;
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
    unsigned ptr[16] = {0};
    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
    v_uint32x4 t1 = v_load(ptr);
    v_uint32x4 t2 = v_load(ptr+4);
    v_uint32x4 t3 = v_load(ptr+8);
    v_uint32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4;
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    unsigned ptr[16] = {0};
    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
    v_uint32x4 t1 = v_load(ptr);
    v_uint32x4 t2 = v_load(ptr+4);
    v_uint32x4 t3 = v_load(ptr+8);
    v_uint32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4 + c;
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
    int ptr[16] = {0};
    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    v_int32x4 t3 = v_load(ptr+8);
    v_int32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4;
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    int ptr[16] = {0};
    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    v_int32x4 t3 = v_load(ptr+8);
    v_int32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4 + c;
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    uint64 ptr[8] = {0};
    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
    v_uint64x2 t1 = v_load(ptr);
    v_uint64x2 t2 = v_load(ptr+2);
    v_uint64x2 t3 = v_load(ptr+4);
    v_uint64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4;
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    uint64 ptr[8] = {0};
    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
    v_uint64x2 t1 = v_load(ptr);
    v_uint64x2 t2 = v_load(ptr+2);
    v_uint64x2 t3 = v_load(ptr+4);
    v_uint64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4 + c;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    int64 ptr[8] = {0};
    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    v_int64x2 t3 = v_load(ptr+4);
    v_int64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
    int64 ptr[8] = {0};
    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    v_int64x2 t3 = v_load(ptr+4);
    v_int64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4 + c;
}
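// Usage sketch (illustrative only): v_dotprod multiplies adjacent lane pairs
// and adds each pair into one widened lane; the *_fast variants may reorder
// the partial sums.
//
//   short sa[8] = {1,2,3,4,5,6,7,8}, sb[8] = {1,1,1,1,1,1,1,1};
//   v_int32x4 d = v_dotprod(v_load(sa), v_load(sb)); // 3, 7, 11, 15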
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3, 4);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
    return v_float32x4(res) + a;
}
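// Usage sketch (illustrative only; v, c0..c3 are hypothetical vectors):
// m0..m3 act as matrix columns, so v_matmul(v, m0, m1, m2, m3) computes
// m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], and v_matmuladd replaces the last
// column with a plain additive term.
//
//   v_float32x4 y = v_matmul(v, c0, c1, c2, c3); // 4x4 matrix times vector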
#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) \
inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
{ \
    _Tpw ptr[_Tpwvec::nlanes*2] = {0}; \
    vse##width##_v_##suffix##m2(ptr, wmul(a, b, vl), vl); \
    c = _Tpwvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
    d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes, hvl)); \
}
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16, 16, 8)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16, 16, 8)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32, 8, 4)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32, 8, 4)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64, 4, 2)
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b, 8), 16, 8));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b, 8), 16, 8));
}
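// Usage sketch (illustrative only): v_mul_expand writes the full-width
// products of the low lanes to c and of the high lanes to d; v_mul_hi keeps
// only the upper half of each 16-bit product.
//
//   v_uint16x8 a = v_setall_u16(300), b = v_setall_u16(400);
//   v_uint32x4 lo, hi;
//   v_mul_expand(a, b, lo, hi); // every lane holds 120000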
#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _wTpvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ \
    a = a * b; \
    return a; \
}

OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)
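// Usage sketch (illustrative only): because the wide products are repacked
// with the saturating v_pack, 8- and 16-bit lane multiplication saturates.
//
//   v_uint8x16 x = v_setall_u8(200);
//   v_uint8x16 p = x * x; // 200*200 = 40000 saturates to 255 in every lane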
inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

}

#endif