#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
#define OPENCV_HAL_INTRIN_RISCVV_HPP

#include "opencv2/core/utility.hpp"

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128_64F 1

// --- excerpts from struct v_uint8x16 ---
    typedef uchar lane_type;

    uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
    val = (vuint8m1_t)vle8_v_u8m1((unsigned char*)v, 16);

    return vmv_x_s_u8m1_u8(val);

// --- excerpts from struct v_int8x16 ---
    typedef schar lane_type;

    explicit v_int8x16(vint8m1_t v) : val(v) {}

    schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
    val = (vint8m1_t)vle8_v_i8m1((schar*)v, 16);

    return vmv_x_s_i8m1_i8(val);

// --- excerpts from struct v_uint16x8 ---
    ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
    val = (vuint16m1_t)vle16_v_u16m1((unsigned short*)v, 8);

    return vmv_x_s_u16m1_u16(val);

// --- excerpts from struct v_int16x8 ---
    typedef short lane_type;

    explicit v_int16x8(vint16m1_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)

    short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
    val = (vint16m1_t)vle16_v_i16m1((signed short*)v, 8);

    return vmv_x_s_i16m1_i16(val);

// --- excerpts from struct v_uint32x4 ---
    typedef unsigned lane_type;

    explicit v_uint32x4(vuint32m1_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)

    unsigned v[] = {v0, v1, v2, v3};
    val = (vuint32m1_t)vle32_v_u32m1((unsigned int*)v, 4);

    unsigned get0() const

    return vmv_x_s_u32m1_u32(val);

// --- excerpts from struct v_int32x4 ---
    typedef int lane_type;

    explicit v_int32x4(vint32m1_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)

    int v[] = {v0, v1, v2, v3};
    val = (vint32m1_t)vle32_v_i32m1((signed int*)v, 4);

    return vmv_x_s_i32m1_i32(val);

// --- excerpts from struct v_float32x4 ---
    typedef float lane_type;

    v_float32x4(float v0, float v1, float v2, float v3)

    float v[] = {v0, v1, v2, v3};
    val = (vfloat32m1_t)vle32_v_f32m1((float*)v, 4);

    return vfmv_f_s_f32m1_f32(val);

// --- excerpts from struct v_uint64x2 ---
    explicit v_uint64x2(vuint64m1_t v) : val(v) {}

    val = (vuint64m1_t)vle64_v_u64m1((unsigned long*)v, 2);

    return vmv_x_s_u64m1_u64(val);

// --- excerpts from struct v_int64x2 ---
    typedef int64 lane_type;

    explicit v_int64x2(vint64m1_t v) : val(v) {}

    int64 v[] = {v0, v1};
    val = (vint64m1_t)vle64_v_i64m1((long*)v, 2);

    return vmv_x_s_i64m1_i64(val);

// --- excerpts from struct v_float64x2 ---
    typedef double lane_type;

    double v[] = {v0, v1};
    val = (vfloat64m1_t)vle64_v_f64m1((double*)v, 2);

    return vfmv_f_s_f64m1_f64(val);
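// Illustrative usage sketch (not part of the implementation): how these fixed-width
// wrapper types are normally used through the portable universal-intrinsics API,
// assuming the usual entry point "opencv2/core/hal/intrin.hpp".
//
//     v_float32x4 a(1.f, 2.f, 3.f, 4.f);   // element-wise constructor shown above
//     float first = a.get0();              // first lane -> 1.f
//     v_int32x4 b(1, 2, 3, 4);
//     int i0 = b.get0();                   // -> 1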
inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); }
inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); }
inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); }
inline v_uint64x2 v_reinterpret_as_u64(const v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); }
inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); }
inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); }
inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); }
inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); }
inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); }
inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); }
inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); }
inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); }
inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); }
inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); }
inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); }
inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); }
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); }
inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); }
inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); }
inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); }
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); }
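// Illustrative sketch (not part of the implementation): v_reinterpret_as_* only relabels
// the 128 register bits, it never converts values. A minimal example, assuming the
// portable universal-intrinsics API:
//
//     v_float32x4 f = v_setall_f32(1.0f);
//     v_uint32x4  u = v_reinterpret_as_u32(f);   // each lane holds 0x3F800000, the IEEE-754 bits
//     v_float32x4 g = v_reinterpret_as_f32(u);   // back to 1.0f, bit-exact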
#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }

OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
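// Illustrative sketch (not part of the implementation): the macro above generates
// v_setzero_<suffix>() and v_setall_<suffix>(x) for the integer vector types, e.g.:
//
//     v_uint8x16 z = v_setzero_u8();      // all 16 lanes = 0
//     v_int32x4  k = v_setall_s32(-7);    // all 4 lanes = -7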
#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val)); \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    a.val = intrin(a.val, b.val); \

#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val, num)); \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    a.val = intrin(a.val, b.val, num); \

OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)

    return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));

    a.val = vfdiv_vv_f32m1(a.val, b.val, 4);

OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)

    return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));

    a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
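// Illustrative sketch (not part of the implementation): 8- and 16-bit +/- are saturating
// (vsadd/vssub above), while 32/64-bit integer and float operators wrap or follow IEEE-754.
//
//     v_uint8x16 a = v_setall_u8(250), b = v_setall_u8(10);
//     v_uint8x16 s = a + b;                                    // saturates to 255, not 4
//     v_float32x4 q = v_setall_f32(1.f) / v_setall_f32(4.f);   // 0.25f in each lane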
#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val)); \

#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    return _Tpvec(intrin(a.val, b.val, num)); \

OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
    return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));

    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));

    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));

    return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, 4));

    return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, 4));

    return v_fma(a, b, c);

    return v_fma(a, b, c);

    vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);

    vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);
    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);
    res = vfadd_vv_f32m1(res, a.val, 4);

    return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));

    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));

    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));

    return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, 2));

    return v_fma(a, b, c);
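// Illustrative sketch (not part of the implementation): v_fma/v_muladd compute a*b + c
// per lane (fused via vfmadd above); v_matmul treats m0..m3 as matrix columns.
//
//     v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f), c = v_setall_f32(1.f);
//     v_float32x4 r = v_fma(a, b, c);     // 7.f in each lane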
#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
    inline _Tpvec operator ~ (const _Tpvec & a) \
        return _Tpvec(vnot_v_##suffix(a.val, num)); \

OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16, i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8, i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)

#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
    a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \

OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)

    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));

#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
    a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \

OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)

    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
    return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));

    return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));

    vbool32_t mask = vmslt_vx_i32m1_b32(x.val, 0, 4);
    return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)));

    vbool16_t mask = vmslt_vx_i16m1_b16(x.val, 0, 8);
    return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)));

    vbool8_t mask = vmslt_vx_i8m1_b8(x.val, 0, 16);
    return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)));
    vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4);

    vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2);

#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){ \
    vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num); \
    vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num); \
    return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\

OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)

    vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16);
    vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16);
    return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16));

    vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8);
    vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8);
    return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8));

#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \
    vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
    vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
    return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(max, min, num))); \

OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
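// Illustrative sketch (not part of the implementation): v_absdiff returns |a - b| in the
// unsigned counterpart type (max(a,b) - min(a,b) above), so the result cannot overflow.
//
//     v_int8x16 a = v_setall_s8(-100), b = v_setall_s8(100);
//     v_uint8x16 d = v_absdiff(a, b);     // 200 in each lane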
    vint16m2_t res = vundefined_i16m2();
    res = vwmul_vv_i16m2(a.val, b.val, 16);
    c.val = vget_v_i16m2_i16m1(res, 0);
    d.val = vget_v_i16m2_i16m1(res, 1);

    vuint16m2_t res = vundefined_u16m2();
    res = vwmulu_vv_u16m2(a.val, b.val, 16);
    c.val = vget_v_u16m2_u16m1(res, 0);
    d.val = vget_v_u16m2_u16m1(res, 1);

    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    c.val = vget_v_i32m2_i32m1(res, 0);
    d.val = vget_v_i32m2_i32m1(res, 1);

    vuint32m2_t res = vundefined_u32m2();
    res = vwmulu_vv_u32m2(a.val, b.val, 8);
    c.val = vget_v_u32m2_u32m1(res, 0);
    d.val = vget_v_u32m2_u32m1(res, 1);

    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    c.val = vget_v_i64m2_i64m1(res, 0);
    d.val = vget_v_i64m2_i64m1(res, 1);

    vuint64m2_t res = vundefined_u64m2();
    res = vwmulu_vv_u64m2(a.val, b.val, 4);
    c.val = vget_v_u64m2_u64m1(res, 0);
    d.val = vget_v_u64m2_u64m1(res, 1);
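// Illustrative sketch (not part of the implementation): v_mul_expand multiplies into the
// next wider type, returning the low/high halves through c and d (vwmul/vget above).
//
//     v_uint16x8 a = v_setall_u16(1000), b = v_setall_u16(70);
//     v_uint32x4 lo, hi;
//     v_mul_expand(a, b, lo, hi);         // 70000 in every 32-bit lane of lo and hi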
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
    vuint32m2_t vindex = vundefined_u32m2();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
    vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
    vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, vindex, 8);
    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4));

    vuint32m2_t vindex = vundefined_u32m2();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
    vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
    vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, vindex, 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4), c.val, 4));

    vuint64m2_t vindex = vundefined_u64m2();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
    vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
    vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, vindex, 4);
    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2));

    vuint64m2_t vindex = vundefined_u64m2();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
    vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
    vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, vindex, 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2), c.val, 2));
    vuint32m4_t vindex32 = vundefined_u32m4();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v1 = vrgather_vv_u16m2(v1, vindex, 16);
    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));

    vuint32m4_t vindex32 = vundefined_u32m4();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v1 = vrgather_vv_u16m2(v1, vindex, 16);
    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));

    vuint32m4_t vindex32 = vundefined_u32m4();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v1 = vrgather_vv_i16m2(v1, vindex, 16);
    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));

    vuint32m4_t vindex32 = vundefined_u32m4();
    vuint32m1_t vindex0 = vid_v_u32m1(4);
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v1 = vrgather_vv_i16m2(v1, vindex, 16);
    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));

    vuint64m4_t vindex64 = vundefined_u64m4();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v1 = vrgather_vv_u32m2(v1, vindex, 8);
    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));

    vuint64m4_t vindex64 = vundefined_u64m4();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v1 = vrgather_vv_u32m2(v1, vindex, 8);
    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));

    vuint64m4_t vindex64 = vundefined_u64m4();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v1 = vrgather_vv_i32m2(v1, vindex, 8);
    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));

    vuint64m4_t vindex64 = vundefined_u64m4();
    vuint64m1_t vindex0 = vid_v_u64m1(2);
    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v1 = vrgather_vv_i32m2(v1, vindex, 8);
    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
    vint32m2_t v1 = vundefined_i32m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4));

    vint32m2_t v1 = vundefined_i32m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4), c.val, 4));

    vint64m2_t v1 = vundefined_i64m2();
    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2));
    vint64m2_t v1 = vundefined_i64m2();
    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), c.val, 2));
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));

    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));

    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));

    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));

    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));

    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));

    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));

    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
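// Illustrative sketch (not part of the implementation): v_dotprod multiplies adjacent lane
// pairs in the wider type and sums each pair; the *_expand forms widen twice, and the
// *_fast forms are allowed to accumulate the pairs in a different lane order.
//
//     v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
//     v_int32x4 d = v_dotprod(a, b);      // each lane = 3*4 + 3*4 = 24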
#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
    v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
    val = intrin(val, a.val, val, num); \
    return vmv_x_s_##len##m1_##len(val); \

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
    v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); \
    val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
    return scalerfunc(val); \

OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)

    vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \
    val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4); \
    return vfmv_f_s_f32m1_f32(val); \

    vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \
    val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2); \
    return vfmv_f_s_f64m1_f64(val); \

{ vuint64m1_t res = vundefined_u64m1();
  return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); }

{ vint64m1_t res = vundefined_i64m1();
  return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); }

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16, vmv_x_s_i8m1_i8) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, vmv_x_s_i64m1_i64) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32)

OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
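// Illustrative sketch (not part of the implementation): the reductions collapse a vector
// into one scalar; the integer sums widen first (vwredsum above), so they do not overflow.
//
//     v_uint8x16 a = v_setall_u8(200);
//     unsigned s = v_reduce_sum(a);                 // 16 * 200 = 3200
//     int m = v_reduce_max(v_setall_s32(-5));       // -5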
    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
    a0 = vfredosum_vs_f32m1_f32m1(a0, a.val, a0, 4);
    b0 = vfredosum_vs_f32m1_f32m1(b0, b.val, b0, 4);
    c0 = vfredosum_vs_f32m1_f32m1(c0, c.val, c0, 4);
    d0 = vfredosum_vs_f32m1_f32m1(d0, d.val, d0, 4);
    res = vslideup_vx_f32m1(a0, b0, 1, 4);
    res = vslideup_vx_f32m1(res, c0, 2, 4);
    res = vslideup_vx_f32m1(res, d0, 3, 4);

    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
    vbool32_t mask = vmflt_vf_f32m1_b32(x, 0, 4);
    vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
    a0 = vfredosum_vs_f32m1_f32m1(a0, val, a0, 4);
    return vfmv_f_s_f32m1_f32(a0);

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec& b){ \
    _Tpvec2 x = v_absdiff(a, b); \
    return v_reduce_sum(x); \
#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \

OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1, 8, 16, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));

    vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));

    vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));

    vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));

    vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));

    vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));

    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));

    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));

    vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));

    vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));

    vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));

    vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));

    vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));

    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
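// Illustrative sketch (not part of the implementation): comparisons return a vector mask
// of the same type, with all bits set in lanes where the predicate holds (the vmerge with
// -1 above), which is the form the portable v_select expects.
//
//     v_int32x4 a(1, 5, 3, 7), b = v_setall_s32(4);
//     v_int32x4 m = a > b;                       // lanes: 0, -1, 0, -1
//     v_int32x4 r = v_select(m, a, b);           // lanes: 4, 5, 4, 7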
#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
                           const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
                           v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
                           v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
    vuint32m4_t vindex = vundefined_u32m4(); \
    vuint32m1_t vindex0 = vid_v_u32m1(4); \
    vindex0 = vsll_vx_u32m1(vindex0, 2, 4); \
    vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \
    vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); \
    vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); \
    vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); \
    v##_Tp##32m4_t val = vundefined_##_T##m4(); \
    val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val); \
    val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val); \
    val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val); \
    val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val); \
    val = vrgather_vv_##_T##m4(val, vindex, 16); \
    b0.val = vget_v_##_T##m4_##_T##m1(val, 0); \
    b1.val = vget_v_##_T##m4_##_T##m1(val, 1); \
    b2.val = vget_v_##_T##m4_##_T##m1(val, 2); \
    b3.val = vget_v_##_T##m4_##_T##m1(val, 3); \

OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }

#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }

#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)

OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
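// Illustrative sketch (not part of the implementation): operator<< / operator>> and the
// v_shl/v_shr templates shift every lane; v_rshr rounds before shifting (the +1<<(n-1)
// above). Unsigned types use srl, signed types use sra.
//
//     v_uint16x8 a = v_setall_u16(5);
//     v_uint16x8 b = a << 3;              // 40 in each lane
//     v_uint16x8 c = v_rshr<2>(a);        // (5 + 2) >> 2 = 1 in each lane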
#define VUP4(n) {0, 1, 2, 3}
#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
#define VUP2(n) {0, 1}

#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
    suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
    tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
    return _Tpvec(tmp);\
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
    suffix##m1_t res = vundefined_##_T##m1(); \
    return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
    suffix##m2_t tmp = vundefined_##_T##m2(); \
    suffix##m2_t res = vundefined_##_T##m2(); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val); \
    res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\
    return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 0));\
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
    suffix##m2_t tmp = vundefined_##_T##m2(); \
    suffix##m2_t res = vundefined_##_T##m2(); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val); \
    res = vslideup_vx_##_T##m2(res, tmp, n, num2);\
    return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1));\
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
    CV_UNUSED(b); return a; \

OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
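// Illustrative sketch (not part of the implementation): v_rotate_left<n>/v_rotate_right<n>
// shift whole lanes (not bits), filling with zeros, or with lanes of a second vector in
// the two-argument forms (vslideup/vslidedown above).
//
//     v_int32x4 a(1, 2, 3, 4);
//     v_int32x4 r = v_rotate_right<1>(a);   // lanes: 2, 3, 4, 0
//     v_int32x4 l = v_rotate_left<1>(a);    // lanes: 0, 1, 2, 3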
#define vreinterpret_v_i8m1_i8m1
#define vreinterpret_v_u8m1_u8m1

#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    _Tp2##_t res = vundefined_##len(); \
    _Tp2##_t res1 = vundefined_##len(); \
    res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); \
    res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); \
    res = vslideup_vx_##len(res, res1, hnum, num); \
    return _Tpvec(res); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    _Tp2##_t a0 = vundefined_##len(); \
    a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
    vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode ) \
{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); }

OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16, i8m1, schar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32, i8m1, schar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64, i8m1, schar)

#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    _Tp2##_t res = vundefined_##len(); \
    _Tp2##_t res1 = vundefined_##len(); \
    res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \
    res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \
    res = vslideup_vx_##len(res, res1, hnum, num); \
    return _Tpvec(res); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    _Tp2##_t a0 = vundefined_##len(); \
    a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
    vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode ) \
{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); }

OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    _Tp2##_t res, res1; \
    res = vle##elemsize##_v_##len(ptr0, hnum); \
    res1 = vle##elemsize##_v_##len(ptr1, hnum); \
    res = vslideup_vx_##len(res, res1, hnum, num); \
    return _Tpvec(res); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, hnum);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
    vse##elemsize##_v_##len(ptr, a0, hnum);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vse##elemsize##_v_##len(ptr, a.val, num); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode ) \
{ vse##elemsize##_v_##len(ptr, a.val, num); }

OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
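// Illustrative sketch (not part of the implementation): v_load/v_store accept unaligned
// pointers, v_load_aligned/v_store_aligned assume 16-byte alignment, and the *_low/_high
// variants move only half a register.
//
//     float buf[4] = {1.f, 2.f, 3.f, 4.f};
//     v_float32x4 v = v_load(buf);
//     v_store(buf, v + v);                // buf becomes {2, 4, 6, 8}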
    return v_int8x16(vle8_v_i8m1(elems, 16));

#if __riscv_v == 7000
    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16));

    return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16));

    return v_int8x16(vle8_v_i8m1(elems, 16));

    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8);
    seq = vid_v_u32m4(16);
    vidx = vrgather_vv_u32m4(vidx, index, 16);
    index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16);
#if __riscv_v == 7000
    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));

    return v_int8x16(vle8_v_i8m1(elems, 16));

    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4);
    seq = vid_v_u32m4(16);
    vidx = vrgather_vv_u32m4(vidx, index, 16);
    seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0));
    seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0));
    seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0));
    index = vadd_vv_u32m4(seq, vidx, 16);
#if __riscv_v == 7000
    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));

    return v_int16x8(vle16_v_i16m1(elems, 8));

#if __riscv_v == 7000
    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8));

    return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8));

    return v_int16x8(vle16_v_i16m1(elems, 8));

    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4);
    seq = vid_v_u32m2(8);
    vidx = vrgather_vv_u32m2(vidx, index, 8);
    index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8);
#if __riscv_v == 7000
    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));

    return v_int16x8(vle16_v_i16m1(elems, 8));

    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2);
    seq = vid_v_u32m2(8);
    vidx = vrgather_vv_u32m2(vidx, index, 8);
    seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0));
    index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8);
#if __riscv_v == 7000
    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));

    return v_int32x4(vle32_v_i32m1(elems, 4));

    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));

    return v_int32x4(vle32_v_i32m1(elems, 4));

    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
    seq = vid_v_u32m1(4);
    vidx = vrgather_vv_u32m1(vidx, index, 4);
    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);

    return v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));

    return v_uint64x2(vloxei64_v_u64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));

    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));

    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
    seq = vid_v_u32m1(4);
    vidx = vrgather_vv_u32m1(vidx, index, 4);
    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);

    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));

    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));

    return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));

    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));

    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2));

    vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4);
    vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4);
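// Illustrative sketch (not part of the implementation): v_lut gathers table elements at
// the given indices (the indexed loads above); v_lut_pairs/v_lut_quads gather consecutive
// runs starting at each index.
//
//     int tab[8] = {10, 11, 12, 13, 14, 15, 16, 17};
//     int idx[4] = {7, 0, 3, 5};
//     v_int32x4 r = v_lut(tab, idx);      // lanes: 17, 10, 13, 15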
#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) \
inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
    return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
} \
template<int n> inline \
v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
    return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
} \
inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
    asm("" ::: "memory"); \
    vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
} \
template<int n> inline \
void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
    vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
}

OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8)
OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed short, 16)
OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32)
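// Illustrative sketch (not part of the original header): v_pack narrows with saturation
// and v_rshr_pack<n> applies a rounding right shift before narrowing. Assuming the
// standard v_setall_* helpers:
//     v_int16x8 a = v_setall_s16(300), b = v_setall_s16(-300);
//     v_int8x16 r  = v_pack(a, b);          // lanes saturate to 127 / -128
//     v_int8x16 r2 = v_rshr_pack<4>(a, b);  // (300 + 8) >> 4 = 19 for the a-lanes
//     schar buf[8];
//     v_pack_store(buf, a);                 // stores the 8 saturated low lanes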
    // v_pack_b(const v_uint16x8& a, const v_uint16x8& b):
    vuint16m2_t tmp = vundefined_u16m2();
    tmp = vset_v_u16m1_u16m2(tmp, 0, a.val);
    tmp = vset_v_u16m1_u16m2(tmp, 1, b.val);
    return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16));

    // v_pack_b(const v_uint32x4& a, ..., const v_uint32x4& d):
    vuint32m4_t vabcd = vundefined_u32m4();
    vuint16m2_t v16 = vundefined_u16m2();
    vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val);
    vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val);
    vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val);
    vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val);
    v16 = vnsrl_wx_u16m2(vabcd, 0, 16);
    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));

    // v_pack_b(const v_uint64x2& a, ..., const v_uint64x2& h):
    vuint64m8_t v64 = vundefined_u64m8();
    vuint32m4_t v32 = vundefined_u32m4();
    vuint16m2_t v16 = vundefined_u16m2();
    v64 = vset_v_u64m1_u64m8(v64, 0, a.val);
    v64 = vset_v_u64m1_u64m8(v64, 1, b.val);
    v64 = vset_v_u64m1_u64m8(v64, 2, c.val);
    v64 = vset_v_u64m1_u64m8(v64, 3, d.val);
    v64 = vset_v_u64m1_u64m8(v64, 4, e.val);
    v64 = vset_v_u64m1_u64m8(v64, 5, f.val);
    v64 = vset_v_u64m1_u64m8(v64, 6, g.val);
    v64 = vset_v_u64m1_u64m8(v64, 7, h.val);
    v32 = vnsrl_wx_u32m4(v64, 0, 16);
    v16 = vnsrl_wx_u16m2(v32, 0, 16);
    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
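// Illustrative sketch (not part of the original header): v_pack_b collapses mask
// vectors of wider lanes into one byte mask, e.g.
//     v_uint16x8 m0 = (v_setall_u16(5) == v_setall_u16(5));  // all-ones lanes
//     v_uint16x8 m1 = (v_setall_u16(1) == v_setall_u16(2));  // all-zero lanes
//     v_uint8x16 m  = v_pack_b(m0, m1);                      // 8 x 0xFF then 8 x 0x00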
#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1); \
    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1)); \
} \
inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1); \
    return vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2); \
} \
template<int n> inline \
v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1); \
    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1)); \
} \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
    vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1); \
    vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1); \
    return vse##tp1##_v_u##tp1##m1(ptr, val, num2); \
}

OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char)
OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
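// Illustrative sketch (not part of the original header): v_pack_u packs signed
// sources into an unsigned result, clamping negative lanes to zero.
//     v_int16x8 a = v_setall_s16(-5), b = v_setall_s16(300);
//     v_uint8x16 r = v_pack_u(a, b);        // a-lanes -> 0, b-lanes -> 255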
#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \
    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
    { \
        auto res = mul(a.val, b.val, num); \
        return _Tpvec(cvt(res, 0, num)); \
    } \
    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
    { a = a * b; return a; }

OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, 32, vwmul_vv_i32m2, vnclip_wx_i16m1)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, 32, vwmulu_vv_u32m2, vnclipu_wx_u16m1)
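// Illustrative sketch (not part of the original header): the operator* defined above
// multiplies in a widened type and narrows back with saturation, so products do not
// wrap around.
//     v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(2);
//     v_uint8x16 c = a * b;                 // 400 saturates to 255 in every lane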
static const signed char popCountTable[256] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
inline vuint8m1_t vcnt_u8(vuint8m1_t val)
{
#if __riscv_v == 7000
    vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16);
    return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16);
#else
    return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16);
#endif
}

    // v_popcount(const v_int8x16& a):
    return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val)));
    // v_popcount(const v_uint16x8& a):
    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val));
    vuint8m1_t seq = vid_v_u8m1(8);
    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));

    // v_popcount(const v_int16x8& a):
    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val)));
    vuint8m1_t seq = vid_v_u8m1(8);
    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));

    // v_popcount(const v_uint32x4& a):
    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val));
    vuint8m1_t seq = vid_v_u8m1(8);
    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
    vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
    return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));

    // v_popcount(const v_int32x4& a):
    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val)));
    vuint8m1_t seq = vid_v_u8m1(8);
    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
    vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
    return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));

    // v_popcount(const v_uint64x2& a):
    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val));
    vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
    vuint16m1_t res1 = vundefined_u16m1();
    vuint16m1_t res2 = vundefined_u16m1();
    res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
    res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
    return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));

    // v_popcount(const v_int64x2& a):
    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val)));
    vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
    vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1();
    res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
    res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
    return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));
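// Illustrative sketch (not part of the original header): v_popcount returns the
// per-lane number of set bits as the corresponding unsigned vector type.
//     v_uint32x4 v = v_setall_u32(0x0F0F0001);
//     v_uint32x4 c = v_popcount(v);          // every lane holds 9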
#define SMASK 1, 2, 4, 8, 16, 32, 64, 128

    // v_signmask(const v_uint8x16& a):
    vuint16m1_t res = vundefined_u16m1();
    vuint8m1_t id = vid_v_u8m1(16);
    vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
    vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16);
    vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16);
    res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
    return vmv_x_s_u16m1_u16(res);

    // v_signmask(const v_int8x16& a):
    vuint16m1_t res = vundefined_u16m1();
    vuint8m1_t id = vid_v_u8m1(16);
    vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
    vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16);
    res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
    return vmv_x_s_u16m1_u16(res);

    // v_signmask(const v_int16x8& a):
    vuint16m1_t res = vundefined_u16m1();
    vuint16m1_t id = vid_v_u16m1(8);
    vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
    vbool16_t mask = vmslt_vx_i16m1_b16(a.val, 0, 8);
    res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
    return vmv_x_s_u16m1_u16(res);

    // v_signmask(const v_uint16x8& a):
    vuint16m1_t res = vundefined_u16m1();
    vuint16m1_t id = vid_v_u16m1(8);
    vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
    vuint16m1_t t0 = vsrl_vx_u16m1(a.val, 15, 8);
    vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8);
    res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8);
    return vmv_x_s_u16m1_u16(res);

    // v_signmask(const v_int32x4& a):
    vuint32m1_t res = vundefined_u32m1();
    vuint32m1_t id = vid_v_u32m1(4);
    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
    vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4);
    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
    return vmv_x_s_u32m1_u32(res);

    // v_signmask(const v_uint32x4& a):
    vuint32m1_t res = vundefined_u32m1();
    vuint32m1_t id = vid_v_u32m1(4);
    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
    vuint32m1_t t0 = vsrl_vx_u32m1(a.val, 31, 4);
    vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4);
    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
    return vmv_x_s_u32m1_u32(res);

    // v_signmask(const v_uint64x2& a):
    vuint64m1_t res = vundefined_u64m1();
    vuint64m1_t id = vid_v_u64m1(2);
    vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2);
    vuint64m1_t t0 = vsrl_vx_u64m1(a.val, 63, 2);
    vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2);
    res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2);
    return vmv_x_s_u64m1_u64(res);

    // v_signmask for the 64-bit lane types (v_int64x2 / v_float64x2) forwards to the unsigned version:
{ return v_signmask(v_reinterpret_as_u64(a)); }
{ return v_signmask(v_reinterpret_as_u64(a)); }
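// Illustrative sketch (not part of the original header): v_signmask packs the sign
// (most significant) bit of each lane into an integer, lowest lane in bit 0.
//     v_int32x4 v(-1, 2, -3, 4);
//     int m = v_signmask(v);                 // m == 0b0101 == 5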
    // v_scan_forward() overloads (one per vector type): each computes
    // val = v_signmask(a), returns 0 when val == 0, and otherwise:
    else return trailingZeros32(val); }
#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
    return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
    return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \
}

OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16, b8)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64)
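// Illustrative sketch (not part of the original header): v_check_all / v_check_any
// test the sign (most significant) bit of every lane.
//     v_int32x4 v(-1, -2, -3, 4);
//     bool all = v_check_all(v);   // false: lane 3 is non-negative
//     bool any = v_check_any(v);   // true:  at least one lane is negative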
#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \
}

OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32)

    // v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b):
    return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4));

    // v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b):
    return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2));
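// Illustrative sketch (not part of the original header): v_select keeps a-lanes where
// the mask lane is non-zero and b-lanes elsewhere.
//     v_float32x4 a = v_setall_f32(1.f), b = v_setall_f32(2.f);
//     v_float32x4 m = (a < b);               // all-ones mask
//     v_float32x4 r = v_select(m, a, b);     // every lane is 1.f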
#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \
inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
{ \
    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \
    b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0); \
    b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1); \
} \
inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num2); \
    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
} \
inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \
    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \
} \
inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2); \
    _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2); \
    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
}

OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8)
OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16)
OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1, 8)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1, 16)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1, 32)
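// Illustrative sketch (not part of the original header): v_expand widens each half of
// a vector to the next lane width; v_load_expand does the same straight from memory.
//     v_uint8x16 v = v_setall_u8(200);
//     v_uint16x8 lo, hi;
//     v_expand(v, lo, hi);                   // both halves hold 200 as 16-bit lanes
//     uchar buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//     v_uint16x8 w = v_load_expand(buf);     // {1,...,8} widened to 16 bits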
    // v_load_expand_q(const uchar* ptr) -> v_uint32x4:
    vuint16m2_t b = vundefined_u16m2();
    vuint32m2_t c = vundefined_u32m2();
    vuint8m1_t val = vle8_v_u8m1(ptr, 4);
    b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4);
    c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);

    // v_load_expand_q(const schar* ptr) -> v_int32x4:
    vint16m2_t b = vundefined_i16m2();
    vint32m2_t c = vundefined_i32m2();
    vint8m1_t val = vle8_v_i8m1(ptr, 4);
    b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4);
    c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);
    return v_int32x4(vget_v_i32m2_i32m1(c, 0));
#define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E}
#define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007}
#define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007}
#define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0}
#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh, refunc) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    v##_Tp##m2_t tmp = vundefined_##_T##m2(); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \
    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \
    unsigned mdata[] = VITL_##num; \
    vuint32m2_t mask = vle32_v_u32m2(mdata, 8); \
    tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2); \
    b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \
    b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
    return v_##_Tpvec(b0); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
    v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
    v##_Tp##m1_t b1 = vundefined_##_T##m1(); \
    b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \
    a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \
    b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
    return v_##_Tpvec(b1); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
    v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
    c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
    b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \
    a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \
    d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
}

OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, vreinterpret_v_u32m2_u64m2)
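// Illustrative sketch (not part of the original header): v_zip interleaves two vectors
// element-wise; v_combine_low/high concatenate their halves.
//     v_uint32x4 a(0, 1, 2, 3), b(10, 11, 12, 13);
//     v_uint32x4 lo, hi;
//     v_zip(a, b, lo, hi);                   // lo = {0,10,1,11}, hi = {2,12,3,13}
//     v_uint32x4 c = v_combine_low(a, b);    // {0, 1, 10, 11}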
    // v_reverse(const v_uint8x16& a):
    return v_uint8x16(vrgather_vv_u8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));

    // v_reverse(const v_int8x16& a):
    return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));

    // v_reverse(const v_uint16x8& a):
    return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));

    // v_reverse(const v_int16x8& a):
    return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));

    // v_reverse(const v_uint32x4& a):
    return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));

    // v_reverse(const v_int32x4& a):
    return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));

    // v_reverse(const v_float32x4& a):
    { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

    // v_reverse(const v_uint64x2& a):
    return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));

    // v_reverse(const v_int64x2& a):
    return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));

    // v_reverse(const v_float64x2& a):
    return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
template<int n> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ return v_rotate_right<n>(a, b); }

OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); }

OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, i8m1, 16, vmv_x_s_i8m1_i8)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, vfmv_f_s_f32m1_f32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64)
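// Illustrative sketch (not part of the original header): v_extract_n<i> reads a single
// lane, while v_extract<n>(a, b) rotates the pair (a, b) right by n lanes.
//     v_int32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
//     int x = v_extract_n<2>(a);             // 2
//     v_int32x4 r = v_extract<1>(a, b);      // {1, 2, 3, 4}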
#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }

OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
inline void __builtin_riscv_fsrm(int val)
{
    // Write the dynamic rounding mode into the frm CSR.
    asm("csrw frm, %0\n\t"
        :
        : "r"(val));
}

inline void barrier1(void *arg)
{
    __asm__ __volatile__("" : : "r" (arg) : "memory");
}
    // v_round(const v_float32x4& a):
    __builtin_riscv_fsrm(0);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);

    // v_floor(const v_float32x4& a):
    __builtin_riscv_fsrm(2);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);

    // v_ceil(const v_float32x4& a):
    __builtin_riscv_fsrm(3);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);

    // v_trunc(const v_float32x4& a):
    __builtin_riscv_fsrm(1);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);

    // v_round(const v_float64x2& a):
    __builtin_riscv_fsrm(0);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
    __builtin_riscv_fsrm(0);

    // v_round(const v_float64x2& a, const v_float64x2& b):
    __builtin_riscv_fsrm(0);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
    __builtin_riscv_fsrm(0);

    // v_floor(const v_float64x2& a):
    __builtin_riscv_fsrm(2);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);

    // v_ceil(const v_float64x2& a):
    __builtin_riscv_fsrm(3);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);

    // v_trunc(const v_float64x2& a):
    __builtin_riscv_fsrm(1);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);
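// Illustrative sketch (not part of the original header): the bodies above select the
// rounding direction through the frm CSR before the float-to-int conversion
// (0 = nearest for v_round, 2 = down for v_floor, 3 = up for v_ceil, 1 = toward zero
// for v_trunc) and restore round-to-nearest afterwards.
//     v_float32x4 v = v_setall_f32(1.7f);
//     v_int32x4 r = v_round(v);   // 2
//     v_int32x4 f = v_floor(v);   // 1
//     v_int32x4 c = v_ceil(v);    // 2
//     v_int32x4 t = v_trunc(v);   // 1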
#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
{ \
    intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
{ \
    intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
{ \
    intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \
}

#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
{ \
    intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, hal::StoreMode =hal::STORE_UNALIGNED) \
{ \
    intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                hal::StoreMode =hal::STORE_UNALIGNED ) \
{ \
    intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \
}

#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) \
OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize) \
OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize)

OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32, 32)
#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
{ vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
{ vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
{ vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                hal::StoreMode =hal::STORE_UNALIGNED) \
{ vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, hal::StoreMode =hal::STORE_UNALIGNED) \
{ vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                hal::StoreMode =hal::STORE_UNALIGNED ) \
{ vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);}

OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64)
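// Illustrative sketch (not part of the original header): the segment load/store
// intrinsics give channel-wise deinterleaving of packed data, e.g. BGR pixels.
//     float bgr[4 * 3];                       // 4 packed B,G,R triples
//     v_float32x4 b, g, r;
//     v_load_deinterleave(bgr, b, g, r);      // split into per-channel vectors
//     v_store_interleave(bgr, b, g, r);       // pack them back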
    // v_cvt_f32(const v_float64x2& a):
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);

    // v_cvt_f32(const v_float64x2& a, const v_float64x2& b):
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4);

    // v_cvt_f64 and v_cvt_f64_high for v_int32x4 share this widening sequence:
    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);

    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);

    // v_cvt_f64 and v_cvt_f64_high for v_float32x4:
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);

    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
    // v_interleave_pairs(const v_int8x16& vec):
    uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));

    // v_interleave_quads(const v_int8x16& vec):
    uint64 mdata[2] = {0x0703060205010400, 0x0F0B0E0A0D090C08};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));

    // v_interleave_pairs(const v_int16x8& vec):
    uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));

    // v_interleave_quads(const v_int16x8& vec):
    uint64 mdata[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));

    // v_interleave_pairs(const v_int32x4& vec):
    uint64 mdata[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));

    // v_pack_triplets(const v_int8x16& vec):
    uint64 mdata[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(m0), 16)));

    // v_pack_triplets(const v_int16x8& vec):
    uint64 mdata[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
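// Illustrative sketch (not part of the original header): these shuffles reorder lanes
// with vrgather using the byte-index tables above.
//     v_int32x4 v(0, 1, 2, 3);
//     v_int32x4 p = v_interleave_pairs(v);   // {0, 2, 1, 3}
//     v_int16x8 t(0, 1, 2, 3, 4, 5, 6, 7);
//     v_int16x8 q = v_pack_triplets(t);      // {0,1,2, 4,5,6, last two lanes unspecified}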
    // Widening 32x32 -> 64-bit multiply, halves summed and converted to double:
    vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), 2);
#if __riscv_v == 7000
    // v_load_expand(const hfloat* ptr) -> v_float32x4:
    vfloat16m1_t v = vle16_v_f16m1((__fp16*)ptr, 4);
    vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);

    // v_pack_store(hfloat* ptr, const v_float32x4& v):
    vfloat32m2_t v32 = vundefined_f32m2();
    v32 = vset_v_f32m1_f32m2(v32, 0, v.val);
    vfloat16m1_t hv = vfncvt_f_f_w_f16m1(v32, 4);
    vse16_v_f16m1((__fp16*)ptr, hv, 4);
#else
    // v_load_expand(const hfloat* ptr) -> v_float32x4:
    vfloat16mf2_t v = vle16_v_f16mf2((__fp16*)ptr, 4);
    vfloat32m1_t v32 = vfwcvt_f_f_v_f32m1(v, 4);

    // v_pack_store(hfloat* ptr, const v_float32x4& v):
    vfloat16mf2_t hv = vfncvt_f_f_w_f16mf2(v.val, 4);
    vse16_v_f16mf2((__fp16*)ptr, hv, 4);
#endif
2894CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END