#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
#define OPENCV_HAL_INTRIN_RISCVV_HPP

#include "opencv2/core/utility.hpp"

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128_64F 1
    typedef uchar lane_type;
    uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
    val = (vuint8m1_t)vle8_v_u8m1((unsigned char*)v, 16);
    return vmv_x_s_u8m1_u8(val);
    typedef schar lane_type;
    explicit v_int8x16(vint8m1_t v) : val(v) {}
    schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
    val = (vint8m1_t)vle8_v_i8m1((schar*)v, 16);
    return vmv_x_s_i8m1_i8(val);
    ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
    val = (vuint16m1_t)vle16_v_u16m1((unsigned short*)v, 8);
    return vmv_x_s_u16m1_u16(val);
    typedef short lane_type;
    explicit v_int16x8(vint16m1_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
    val = (vint16m1_t)vle16_v_i16m1((signed short*)v, 8);
    return vmv_x_s_i16m1_i16(val);
    typedef unsigned lane_type;
    explicit v_uint32x4(vuint32m1_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    unsigned v[] = {v0, v1, v2, v3};
    val = (vuint32m1_t)vle32_v_u32m1((unsigned int*)v, 4);
    unsigned get0() const
    return vmv_x_s_u32m1_u32(val);
    typedef int lane_type;
    explicit v_int32x4(vint32m1_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    int v[] = {v0, v1, v2, v3};
    val = (vint32m1_t)vle32_v_i32m1((signed int*)v, 4);
    return vmv_x_s_i32m1_i32(val);
    typedef float lane_type;
    v_float32x4(float v0, float v1, float v2, float v3)
    float v[] = {v0, v1, v2, v3};
    val = (vfloat32m1_t)vle32_v_f32m1((float*)v, 4);
    return vfmv_f_s_f32m1_f32(val);
    explicit v_uint64x2(vuint64m1_t v) : val(v) {}
    val = (vuint64m1_t)vle64_v_u64m1((unsigned long*)v, 2);
    return vmv_x_s_u64m1_u64(val);
    typedef int64 lane_type;
    explicit v_int64x2(vint64m1_t v) : val(v) {}
    int64 v[] = {v0, v1};
    val = (vint64m1_t)vle64_v_i64m1((long*)v, 2);
    return vmv_x_s_i64m1_i64(val);
    typedef double lane_type;
    double v[] = {v0, v1};
    val = (vfloat64m1_t)vle64_v_f64m1((double*)v, 2);
    return vfmv_f_s_f64m1_f64(val);
inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); }
inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
inline v_uint16x8 v_reinterpret_as_u16(const v_int8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(vreinterpret_v_i8m1_u8m1(v.val))); }
inline v_uint32x4 v_reinterpret_as_u32(const v_int8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(vreinterpret_v_i8m1_u8m1(v.val))); }
inline v_uint64x2 v_reinterpret_as_u64(const v_int8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(vreinterpret_v_i8m1_u8m1(v.val))); }
inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); }
inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); }
inline v_uint64x2 v_reinterpret_as_u64(const v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); }
inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); }
inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); }
inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); }
inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); }
inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); }
inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); }
inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); }
inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); }
inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); }
inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); }
inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); }
inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); }
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); }
inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); }
inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); }
inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); }
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); }
#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
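// Usage sketch (illustrative only, not part of the original header): the
// generated initializers splat a scalar across every lane or zero a register.
// Assuming a VLEN=128 build where v_uint8x16 holds 16 lanes:
//
//     v_uint8x16 ones  = v_setall_u8(1);   // 16 lanes, each = 1
//     v_int32x4  zeros = v_setzero_s32();  // 4 lanes, each = 0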
#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val)); \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    a.val = intrin(a.val, b.val); \
#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val, num)); \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    a.val = intrin(a.val, b.val, num); \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
    return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
    a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
    return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
    a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
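// Usage sketch (illustrative): the 8/16-bit operators map to saturating RVV
// instructions (vsaddu/vssubu/vsadd/vssub), the 32/64-bit integer operators
// wrap, and the float operators use ordinary IEEE arithmetic.
//
//     v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//     v_uint8x16 c = a + b;                      // every lane saturates to 255
//     v_float32x4 n(1.f, 2.f, 3.f, 4.f), d(4.f, 4.f, 4.f, 4.f);
//     v_float32x4 q = n / d;                     // element-wise vfdiv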
#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val)); \
#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val, num)); \
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
    return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));
    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
    return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, 4));
    return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, 4));
    return v_fma(a, b, c);
    return v_fma(a, b, c);
    vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);
    vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);
    res = vfadd_vv_f32m1(res, a.val, 4);
    return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));
    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
    return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, 2));
    return v_fma(a, b, c);
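// Usage sketch (illustrative): v_fma(a, b, c) computes a*b + c per lane with a
// single fused vfmadd/vmadd, and v_muladd simply forwards to v_fma here.
//
//     v_float32x4 a(1.f, 2.f, 3.f, 4.f);
//     v_float32x4 b(10.f, 10.f, 10.f, 10.f);
//     v_float32x4 c(0.5f, 0.5f, 0.5f, 0.5f);
//     v_float32x4 r = v_fma(a, b, c);            // {10.5, 20.5, 30.5, 40.5}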
#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
inline _Tpvec operator ~ (const _Tpvec & a) \
    return _Tpvec(vnot_v_##suffix(a.val, num)); \
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16, i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8, i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
    a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
    a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
    return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));
    return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));
    vbool32_t mask = vmslt_vx_i32m1_b32(x.val, 0, 4);
    return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)));
    vbool16_t mask = vmslt_vx_i16m1_b16(x.val, 0, 8);
    return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)));
    vbool8_t mask = vmslt_vx_i8m1_b8(x.val, 0, 16);
    return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)));
    vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4);
    vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2);
#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){ \
    vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num); \
    vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num); \
    return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)
    vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16);
    vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16);
    return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16));
    vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8);
    vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8);
    return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8));
#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \
    vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
    vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
    return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(max, min, num))); \
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
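// Usage sketch (illustrative): for signed inputs v_absdiff widens to the
// unsigned type so the difference cannot overflow, while v_absdiffs keeps the
// signed type and saturates.
//
//     v_int8x16 a = v_setall_s8(127), b = v_setall_s8(-128);
//     v_uint8x16 d  = v_absdiff(a, b);           // 255 in every lane
//     v_int8x16  ds = v_absdiffs(a, b);          // saturates to 127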
    vint16m2_t res = vundefined_i16m2();
    res = vwmul_vv_i16m2(a.val, b.val, 16);
    c.val = vget_v_i16m2_i16m1(res, 0);
    d.val = vget_v_i16m2_i16m1(res, 1);
    vuint16m2_t res = vundefined_u16m2();
    res = vwmulu_vv_u16m2(a.val, b.val, 16);
    c.val = vget_v_u16m2_u16m1(res, 0);
    d.val = vget_v_u16m2_u16m1(res, 1);
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    c.val = vget_v_i32m2_i32m1(res, 0);
    d.val = vget_v_i32m2_i32m1(res, 1);
    vuint32m2_t res = vundefined_u32m2();
    res = vwmulu_vv_u32m2(a.val, b.val, 8);
    c.val = vget_v_u32m2_u32m1(res, 0);
    d.val = vget_v_u32m2_u32m1(res, 1);
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    c.val = vget_v_i64m2_i64m1(res, 0);
    d.val = vget_v_i64m2_i64m1(res, 1);
    vuint64m2_t res = vundefined_u64m2();
    res = vwmulu_vv_u64m2(a.val, b.val, 4);
    c.val = vget_v_u64m2_u64m1(res, 0);
    d.val = vget_v_u64m2_u64m1(res, 1);
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
    vuint32m2_t vindex = vundefined_u32m2();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
    vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
    vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, vindex, 8);
    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4));
    vuint32m2_t vindex = vundefined_u32m2();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
    vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
    vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, vindex, 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4), c.val, 4));
    vuint64m2_t vindex = vundefined_u64m2();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
    vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
    vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, vindex, 4);
    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2));
    vuint64m2_t vindex = vundefined_u64m2();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
    vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
    vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, vindex, 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2), c.val, 2));
    vuint32m4_t vindex32 = vundefined_u32m4();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v1 = vrgather_vv_u16m2(v1, vindex, 16);
    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
    vuint32m4_t vindex32 = vundefined_u32m4();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v1 = vrgather_vv_u16m2(v1, vindex, 16);
    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
    vuint32m4_t vindex32 = vundefined_u32m4();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v1 = vrgather_vv_i16m2(v1, vindex, 16);
    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
    vuint32m4_t vindex32 = vundefined_u32m4();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v1 = vrgather_vv_i16m2(v1, vindex, 16);
    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
    vuint64m4_t vindex64 = vundefined_u64m4();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v1 = vrgather_vv_u32m2(v1, vindex, 8);
    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
    vuint64m4_t vindex64 = vundefined_u64m4();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v1 = vrgather_vv_u32m2(v1, vindex, 8);
    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
    vuint64m4_t vindex64 = vundefined_u64m4();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v1 = vrgather_vv_i32m2(v1, vindex, 8);
    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
    vuint64m4_t vindex64 = vundefined_u64m4();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v1 = vrgather_vv_i32m2(v1, vindex, 8);
    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
    vint32m2_t v1 = vundefined_i32m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4));
    vint32m2_t v1 = vundefined_i32m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4), c.val, 4));
    vint64m2_t v1 = vundefined_i64m2();
    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2));
    vint64m2_t v1 = vundefined_i64m2();
    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), c.val, 2));
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
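// Usage sketch (illustrative): v_dotprod multiplies adjacent lane pairs and
// adds each pair into one widened lane; the *_expand forms widen twice
// (e.g. 8-bit -> 32-bit) and the *_fast variants may reorder the summation.
//
//     v_int16x8 a = v_setall_s16(2), b = v_setall_s16(3);
//     v_int32x4 s = v_dotprod(a, b);             // every lane = 2*3 + 2*3 = 12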
#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
    v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
    val = intrin(val, a.val, val, num); \
    return vmv_x_s_##len##m1_##len(val); \
#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
    v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); \
    val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
    return scalerfunc(val); \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
    vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \
    val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4); \
    return vfmv_f_s_f32m1_f32(val); \
    vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \
    val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2); \
    return vfmv_f_s_f64m1_f64(val); \
{ vuint64m1_t res = vundefined_u64m1(); return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); }
{ vint64m1_t res = vundefined_i64m1(); return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); }
#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16, vmv_x_s_i8m1_i8) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, vmv_x_s_i64m1_i64) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
    a0 = vfredosum_vs_f32m1_f32m1(a0, a.val, a0, 4);
    b0 = vfredosum_vs_f32m1_f32m1(b0, b.val, b0, 4);
    c0 = vfredosum_vs_f32m1_f32m1(c0, c.val, c0, 4);
    d0 = vfredosum_vs_f32m1_f32m1(d0, d.val, d0, 4);
    res = vslideup_vx_f32m1(a0, b0, 1, 4);
    res = vslideup_vx_f32m1(res, c0, 2, 4);
    res = vslideup_vx_f32m1(res, d0, 3, 4);
    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
    vbool32_t mask = vmflt_vf_f32m1_b32(x, 0, 4);
    vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
    a0 = vfredosum_vs_f32m1_f32m1(a0, val, a0, 4);
    return vfmv_f_s_f32m1_f32(a0);
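// Usage sketch (illustrative): the reduction helpers fold a whole register
// into one scalar using the RVV reduction instructions (vredsum/vredmax/...).
//
//     v_int16x8 v = v_setall_s16(3);
//     int s = v_reduce_sum(v);                   // 8 * 3 = 24
//     int m = v_reduce_max(v);                   // 3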
#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec& b){ \
    _Tpvec2 x = v_absdiff(a, b); \
    return v_reduce_sum(x); \
#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1, 8, 16, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
    vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
    vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
    vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
    vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
    vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
    vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
    vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
    vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
    vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
    vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
                           const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
                           v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
                           v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
    vuint32m4_t vindex = vundefined_u32m4(); \
    vuint32m1_t vindex0 = vid_v_u32m1(4); \
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4); \
    vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \
    vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); \
    vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); \
    vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); \
    v##_Tp##32m4_t val = vundefined_##_T##m4(); \
    val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val); \
    val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val); \
    val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val); \
    val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val); \
    val = vrgather_vv_##_T##m4(val, vindex, 16); \
    b0.val = vget_v_##_T##m4_##_T##m1(val, 0); \
    b1.val = vget_v_##_T##m4_##_T##m1(val, 1); \
    b2.val = vget_v_##_T##m4_##_T##m1(val, 2); \
    b3.val = vget_v_##_T##m4_##_T##m1(val, 3); \
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
#define VUP4(n) {0, 1, 2, 3}
#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
#define VUP2(n) {0, 1}
#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
    suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
    tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
    return _Tpvec(tmp);\
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
    suffix##m1_t res = vundefined_##_T##m1(); \
    return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
    suffix##m2_t tmp = vundefined_##_T##m2(); \
    suffix##m2_t res = vundefined_##_T##m2(); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val); \
    res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\
    return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 0));\
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
    suffix##m2_t tmp = vundefined_##_T##m2(); \
    suffix##m2_t res = vundefined_##_T##m2(); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val); \
    res = vslideup_vx_##_T##m2(res, tmp, n, num2);\
    return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1));\
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
    CV_UNUSED(b); return a; \
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
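// Usage sketch (illustrative): the two-argument rotate forms shift lanes
// across a register pair, which is how the universal API expresses an
// unaligned window over two consecutive vectors.
//
//     v_uint8x16 a = v_setall_u8(1), b = v_setall_u8(2);
//     v_uint8x16 r = v_rotate_right<4>(a, b);    // lanes 4..15 of a, then lanes 0..3 of b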
#define vreinterpret_v_i8m1_i8m1
#define vreinterpret_v_u8m1_u8m1
#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    _Tp2##_t res = vundefined_##len(); \
    _Tp2##_t res1 = vundefined_##len(); \
    res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); \
    res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); \
    res = vslideup_vx_##len(res, res1, hnum, num); \
    return _Tpvec(res); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    _Tp2##_t a0 = vundefined_##len(); \
    a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
    vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode ) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); }
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16, i8m1, schar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32, i8m1, schar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64, i8m1, schar)
#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    _Tp2##_t res = vundefined_##len(); \
    _Tp2##_t res1 = vundefined_##len(); \
    res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \
    res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \
    res = vslideup_vx_##len(res, res1, hnum, num); \
    return _Tpvec(res); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    _Tp2##_t a0 = vundefined_##len(); \
    a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
    vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode ) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); }
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    _Tp2##_t res, res1; \
    res = vle##elemsize##_v_##len(ptr0, hnum); \
    res1 = vle##elemsize##_v_##len(ptr1, hnum); \
    res = vslideup_vx_##len(res, res1, hnum, num); \
    return _Tpvec(res); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, hnum);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
    vse##elemsize##_v_##len(ptr, a0, hnum);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode ) \
{ vse##elemsize##_v_##len(ptr, a.val, num); }
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
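// Usage sketch (illustrative): loads and stores are plain unit-stride vle/vse
// accesses; the "aligned" variants expand to the same unit-stride
// instructions in this backend.
//
//     float buf[4] = {1.f, 2.f, 3.f, 4.f}, out[4];
//     v_float32x4 v = v_load(buf);
//     v_store(out, v + v);                       // out = {2, 4, 6, 8}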
    return v_int8x16(vle8_v_i8m1(elems, 16));
#if __riscv_v == 7000
    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16));
    return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16));
    return v_int8x16(vle8_v_i8m1(elems, 16));
    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8);
    seq = vid_v_u32m4(16);
    vidx = vrgather_vv_u32m4(vidx, index, 16);
    index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16);
#if __riscv_v == 7000
    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
    return v_int8x16(vle8_v_i8m1(elems, 16));
    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4);
    seq = vid_v_u32m4(16);
    vidx = vrgather_vv_u32m4(vidx, index, 16);
    seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0));
    seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0));
    seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0));
    index = vadd_vv_u32m4(seq, vidx, 16);
#if __riscv_v == 7000
    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
    return v_int16x8(vle16_v_i16m1(elems, 8));
#if __riscv_v == 7000
    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8));
    return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8));
    return v_int16x8(vle16_v_i16m1(elems, 8));
    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4);
    seq = vid_v_u32m2(8);
    vidx = vrgather_vv_u32m2(vidx, index, 8);
    index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8);
#if __riscv_v == 7000
    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
    return v_int16x8(vle16_v_i16m1(elems, 8));
    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2);
    seq = vid_v_u32m2(8);
    vidx = vrgather_vv_u32m2(vidx, index, 8);
    seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0));
    index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8);
#if __riscv_v == 7000
    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
    return v_int32x4(vle32_v_i32m1(elems, 4));
    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
    return v_int32x4(vle32_v_i32m1(elems, 4));
    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
    seq = vid_v_u32m1(4);
    vidx = vrgather_vv_u32m1(vidx, index, 4);
    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
    return v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
    return v_uint64x2(vloxei64_v_u64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
    seq = vid_v_u32m1(4);
    vidx = vrgather_vv_u32m1(vidx, index, 4);
    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
    return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2));
    vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4);
    vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4);
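// Usage sketch (illustrative, assumes the standard universal-intrinsics
// v_lut signature implemented by the gathers above): v_lut collects arbitrary
// table elements through an index array using indexed loads (vloxei*, or the
// vlx* forms when __riscv_v == 7000); the related pair/quad forms fetch 2 or 4
// consecutive elements per index.
//
//     int tab[8] = {10, 11, 12, 13, 14, 15, 16, 17};
//     int idx[4] = {0, 2, 4, 6};
//     v_int32x4 g = v_lut(tab, idx);             // {10, 12, 14, 16}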
#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) \
inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
    return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
template<int n> inline \
v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
    return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
    asm("" ::: "memory"); \
    vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
template<int n> inline \
void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
    vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8)
OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed short, 16)
OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32)
    vuint16m2_t tmp = vundefined_u16m2(); \
    tmp = vset_v_u16m1_u16m2(tmp, 0, a.val); \
    tmp = vset_v_u16m1_u16m2(tmp, 1, b.val); \
    return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16));

    vuint32m4_t vabcd = vundefined_u32m4(); \
    vuint16m2_t v16 = vundefined_u16m2(); \
    vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val); \
    vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val); \
    vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val); \
    vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val); \
    v16 = vnsrl_wx_u16m2(vabcd, 0, 16);
    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));

    vuint64m8_t v64 = vundefined_u64m8(); \
    vuint32m4_t v32 = vundefined_u32m4(); \
    vuint16m2_t v16 = vundefined_u16m2(); \
    v64 = vset_v_u64m1_u64m8(v64, 0, a.val); \
    v64 = vset_v_u64m1_u64m8(v64, 1, b.val); \
    v64 = vset_v_u64m1_u64m8(v64, 2, c.val); \
    v64 = vset_v_u64m1_u64m8(v64, 3, d.val); \
    v64 = vset_v_u64m1_u64m8(v64, 4, e.val); \
    v64 = vset_v_u64m1_u64m8(v64, 5, f.val); \
    v64 = vset_v_u64m1_u64m8(v64, 6, g.val); \
    v64 = vset_v_u64m1_u64m8(v64, 7, h.val); \
    v32 = vnsrl_wx_u32m4(v64, 0, 16);
    v16 = vnsrl_wx_u16m2(v32, 0, 16);
    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
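
// v_pack_u / v_rshr_pack_u: signed-to-unsigned saturating pack. Negative lanes are first
// clamped to zero with vmax_vx, then the widened value is narrowed with vnclipu.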
#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1)); \
} \
inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
    return vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2); \
} \
template<int n> inline \
v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1)); \
} \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
    vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
    vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1); \
    return vse##tp1##_v_u##tp1##m1(ptr, val, num2);\
}
OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
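
// Saturating multiplication: widen with vwmul/vwmulu into an m2 register, then narrow back
// with a clipping shift of 0, so an overflowing product saturates instead of wrapping.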
#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    auto res = mul(a.val, b.val, num); \
    return _Tpvec(cvt(res, 0, num)); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, 32, vwmul_vv_i32m2, vnclip_wx_i16m1)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, 32, vwmulu_vv_u32m2, vnclipu_wx_u16m1)
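
// popCountTable and vcnt_u8 implement a per-byte population count via an indexed gather
// from a 256-entry lookup table; the wider v_popcount variants below sum adjacent byte
// counts. On RVV 0.7.1 the table is indexed with val >> 1 and the low bit added separately.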
static const signed char popCountTable[256] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
inline vuint8m1_t vcnt_u8(vuint8m1_t val){
#if __riscv_v == 7000
    vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16);
    return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16);
#else
    return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16);
#endif
}
    return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val)));

    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val));
    vuint8m1_t seq = vid_v_u8m1(8);
    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));

    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val)));
    vuint8m1_t seq = vid_v_u8m1(8);
    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));

    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val));
    vuint8m1_t seq = vid_v_u8m1(8);
    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
    vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
    return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));

    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val)));
    vuint8m1_t seq = vid_v_u8m1(8);
    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
    vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
    return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));

    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val));
    vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
    vuint16m1_t res1 = vundefined_u16m1();
    vuint16m1_t res2 = vundefined_u16m1();
    res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
    res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
    return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));

    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val)));
    vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
    vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1();
    res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
    res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
    return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));
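
// v_signmask: build a per-lane weight (1 << lane index) with vid/vsll, keep only the lanes
// whose sign bit is set (vmslt on the value, or vmseq on the shifted-down sign bit), and
// reduce the surviving weights with a masked vredsum into the packed sign-bit mask.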
#define SMASK 1, 2, 4, 8, 16, 32, 64, 128

    vuint16m1_t res = vundefined_u16m1();
    vuint8m1_t id = vid_v_u8m1(16);
    vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
    vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16);
    vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16);
    res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
    return vmv_x_s_u16m1_u16(res);

    vuint16m1_t res = vundefined_u16m1();
    vuint8m1_t id = vid_v_u8m1(16);
    vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
    vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16);
    res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
    return vmv_x_s_u16m1_u16(res);

    vuint16m1_t res = vundefined_u16m1();
    vuint16m1_t id = vid_v_u16m1(8);
    vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
    vbool16_t mask = vmslt_vx_i16m1_b16(a.val, 0, 8);
    res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
    return vmv_x_s_u16m1_u16(res);

    vuint16m1_t res = vundefined_u16m1();
    vuint16m1_t id = vid_v_u16m1(8);
    vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
    vuint16m1_t t0 = vsrl_vx_u16m1(a.val, 15, 8);
    vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8);
    res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8);
    return vmv_x_s_u16m1_u16(res);

    vuint32m1_t res = vundefined_u32m1();
    vuint32m1_t id = vid_v_u32m1(4);
    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
    vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4);
    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
    return vmv_x_s_u32m1_u32(res);

    vuint32m1_t res = vundefined_u32m1();
    vuint32m1_t id = vid_v_u32m1(4);
    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
    vuint32m1_t t0 = vsrl_vx_u32m1(a.val, 31, 4);
    vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4);
    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
    return vmv_x_s_u32m1_u32(res);

    vuint64m1_t res = vundefined_u64m1();
    vuint64m1_t id = vid_v_u64m1(2);
    vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2);
    vuint64m1_t t0 = vsrl_vx_u64m1(a.val, 63, 2);
    vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2);
    res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2);
    return vmv_x_s_u64m1_u64(res);

{ return v_signmask(v_reinterpret_as_u64(a)); }
{ return v_signmask(v_reinterpret_as_u64(a)); }
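
// v_scan_forward: index of the first negative lane, computed as the number of trailing
// zeros of the v_signmask() value (0 when the mask is empty).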
    if(val==0) return 0;
    else return trailingZeros32(val); }

    if(val==0) return 0;
    else return trailingZeros32(val); }

    if(val==0) return 0;
    else return trailingZeros32(val); }

    if(val==0) return 0;
    else return trailingZeros32(val); }

    if(val==0) return 0;
    else return trailingZeros32(val); }

    if(val==0) return 0;
    else return trailingZeros32(val); }

    if(val==0) return 0;
    else return trailingZeros32(val); }

    if(val==0) return 0;
    else return trailingZeros32(val); }

    if(val==0) return 0;
    else return trailingZeros32(val); }
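
// v_check_all / v_check_any: isolate each lane's sign bit with a logical shift right,
// compare against 1, and count the matching lanes with vcpop. v_check_all complements the
// input first, so a zero popcount means every sign bit was set.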
#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
    return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
    return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \
}

OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16, b8)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64)
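
// v_select: per-lane blend. A mask of "lane != 0" (vmsne, or vmfne for floats) drives
// vmerge, which takes a.val where the mask lane is set and b.val elsewhere.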
#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \
}

OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32)

    return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4));

    return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2));
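
// v_expand / v_load_expand: widen every lane by adding zero with a widening add
// (vwaddu/vwadd) into an m2 register, then split it into its low and high m1 halves.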
#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \
inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
{ \
    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \
    b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0); \
    b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1); \
} \
inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num2); \
    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
} \
inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \
    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \
} \
inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2); \
    _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2); \
    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
}

OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8)
OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16)
OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1, 8)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1, 16)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1, 32)

    vuint16m2_t b = vundefined_u16m2();
    vuint32m2_t c = vundefined_u32m2();
    vuint8m1_t val = vle8_v_u8m1(ptr, 4); \
    b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4); \
    c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \

    vint16m2_t b = vundefined_i16m2();
    vint32m2_t c = vundefined_i32m2();
    vint8m1_t val = vle8_v_i8m1(ptr, 4); \
    b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4); \
    c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \
    return v_int32x4(vget_v_i32m2_i32m1(c, 0));
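
// VITL_* are the gather-index tables used by v_zip below: for an m2 register holding a0
// followed by a1, the indices interleave lane i of a0 with lane i of a1.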
#define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E}
#define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007}
#define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007}
#define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0}

#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh, refunc) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    v##_Tp##m2_t tmp = vundefined_##_T##m2();\
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \
    unsigned mdata[] = VITL_##num; \
    vuint32m2_t mask = vle32_v_u32m2(mdata, 8); \
    tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2); \
    b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \
    b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
    return v_##_Tpvec(b0);\
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
    v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
    v##_Tp##m1_t b1 = vundefined_##_T##m1(); \
    b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \
    a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \
    b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
    return v_##_Tpvec(b1);\
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
    v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
    c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
    b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \
    a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \
    d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
}

OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, vreinterpret_v_u32m2_u64m2)
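
// v_reverse: gather with indices (num-1) - i, produced by vid_v followed by vrsub_vx.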
    return v_uint8x16(vrgather_vv_u8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));

    return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));

    return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));

    return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));

    return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));

    return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));

{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

    return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));

    return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));

    return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
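
// v_extract<n>(a, b) is defined as v_rotate_right<n>(a, b); v_extract_n<i> below pulls a
// single lane out by sliding it down to element 0 and reading it with vmv_x_s / vfmv_f_s.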
#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
template<int n> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ return v_rotate_right<n>(a, b);}
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); }

OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, i8m1, 16, vmv_x_s_i8m1_i8)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, vfmv_f_s_f32m1_f32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64)
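
// v_broadcast_element<i>: replicate lane i across the whole vector with vrgather_vx.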
#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }

OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
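
// Rounding helpers: __builtin_riscv_fsrm writes the scalar FP rounding-mode CSR (frm), so
// the rounding fragments below select RNE(0)/RTZ(1)/RDN(2)/RUP(3) before converting and
// restore 0 afterwards. The 0x7f800000 exponent test masks out NaN/Inf lanes, which are
// produced as 0 via the masked vfcvt with a zero fallback operand.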
inline void __builtin_riscv_fsrm(int val)
{
    asm("csrw frm, %0\n\t"
        :
        : "r"(val));
}

inline void barrier1(void *arg) {
    __asm__ __volatile__("" : : "r" (arg) : "memory");
}
    __builtin_riscv_fsrm(0);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);

    __builtin_riscv_fsrm(2);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);

    __builtin_riscv_fsrm(3);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);

    __builtin_riscv_fsrm(1);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);

    __builtin_riscv_fsrm(0);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
    __builtin_riscv_fsrm(0);

    __builtin_riscv_fsrm(0);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
    __builtin_riscv_fsrm(0);

    __builtin_riscv_fsrm(2);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);

    __builtin_riscv_fsrm(3);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);

    __builtin_riscv_fsrm(1);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);
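
// Interleaved load/store: the deinterleave/interleave helpers map directly onto the RVV
// segment load/store intrinsics (vlseg2e../vlseg3e../vlseg4e.. and their vsseg counterparts).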
#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
{ \
    intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
{ \
    intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
{ \
    intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \
}

#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
{ \
    intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, hal::StoreMode =hal::STORE_UNALIGNED) \
{ \
    intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                hal::StoreMode =hal::STORE_UNALIGNED ) \
{ \
    intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \
}

#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) \
OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize) \
OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize)

OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32, 32)

#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
{ vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
{ vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);}\
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
{ vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
{ vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, hal::StoreMode =hal::STORE_UNALIGNED) \
{ vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                hal::StoreMode =hal::STORE_UNALIGNED ) \
{ vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);}

OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64)
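
// Float conversions: the fragments below narrow float64 pairs with vfncvt_f_f_w_f32m1 and
// widen int32/float32 lanes to float64 with vfwcvt_f_f_v_f64m2 (integer inputs first pass
// through vfcvt_f_x_v).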
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);

    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4);

    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);

    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);

    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);

    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
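
// v_interleave_pairs / v_interleave_quads / v_pack_triplets: byte-permutation patterns are
// stored as two 64-bit constants and applied with vrgather over the u8 view of the vector.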
    uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));

    uint64 mdata[2] = {0x0703060205010400, 0x0F0B0E0A0D090C08};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));

    uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));

    uint64 mdata[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));

    uint64 mdata[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));

    uint64 mdata[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(m0), 16)));

    uint64 mdata[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));

    vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), 2);
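
// Half-precision load/expand and pack/store: RVV 0.7.1 (__riscv_v == 7000) works on a full
// f16m1 register, while the RVV 1.0 path uses the fractional-LMUL f16mf2 type for 4 lanes.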
#if __riscv_v == 7000
    vfloat16m1_t v = vle16_v_f16m1((__fp16*)ptr, 4);
    vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);

    vfloat32m2_t v32 = vundefined_f32m2();
    v32 = vset_v_f32m1_f32m2(v32, 0, v.val);
    vfloat16m1_t hv = vfncvt_f_f_w_f16m1(v32, 4);
    vse16_v_f16m1((__fp16*)ptr, hv, 4);
#else
    vfloat16mf2_t v = vle16_v_f16mf2((__fp16*)ptr, 4);
    vfloat32m1_t v32 = vfwcvt_f_f_v_f32m1(v, 4);

    vfloat16mf2_t hv = vfncvt_f_f_w_f16mf2(v.val, 4);
    vse16_v_f16mf2((__fp16*)ptr, hv, 4);
#endif
2894 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END