EstervQrCode 2.0.0
Library for QR code manipulation
intrin_rvv071.hpp
1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html
4
5// Copyright (C) 2015, PingTouGe Semiconductor Co., Ltd., all rights reserved.
6
7#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
8#define OPENCV_HAL_INTRIN_RISCVV_HPP
9
10#include <float.h>
11#include <algorithm>
12#include "opencv2/core/utility.hpp"
13
14namespace cv
15{
16
18
19CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
20
21#define CV_SIMD128 1
22#define CV_SIMD128_64F 1
24struct v_uint8x16
25{
26 typedef uchar lane_type;
27 enum { nlanes = 16 };
28
29 v_uint8x16() {}
30 explicit v_uint8x16(vuint8m1_t v) : val(v) {}
31 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
32 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
33 {
34 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
35 val = (vuint8m1_t)vle8_v_u8m1((unsigned char*)v, 16);
36 }
37 uchar get0() const
38 {
39 return vmv_x_s_u8m1_u8(val);
40 }
41
42 vuint8m1_t val;
43};
44
45struct v_int8x16
46{
47 typedef schar lane_type;
48 enum { nlanes = 16 };
49
50 v_int8x16() {}
51 explicit v_int8x16(vint8m1_t v) : val(v) {}
52 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
53 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
54 {
55 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
56 val = (vint8m1_t)vle8_v_i8m1((schar*)v, 16);
57 }
58 schar get0() const
59 {
60 return vmv_x_s_i8m1_i8(val);
61 }
62
63 vint8m1_t val;
64};
65
66struct v_uint16x8
67{
68 typedef ushort lane_type;
69 enum { nlanes = 8 };
70
71 v_uint16x8() {}
72 explicit v_uint16x8(vuint16m1_t v) : val(v) {}
73 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
74 {
75 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
76 val = (vuint16m1_t)vle16_v_u16m1((unsigned short*)v, 8);
77 }
78 ushort get0() const
79 {
80 return vmv_x_s_u16m1_u16(val);
81 }
82
83 vuint16m1_t val;
84};
85
86struct v_int16x8
87{
88 typedef short lane_type;
89 enum { nlanes = 8 };
90
91 v_int16x8() {}
92 explicit v_int16x8(vint16m1_t v) : val(v) {}
93 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
94 {
95 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
96 val = (vint16m1_t)vle16_v_i16m1((signed short*)v, 8);
97 }
98 short get0() const
99 {
100 return vmv_x_s_i16m1_i16(val);
101 }
102
103 vint16m1_t val;
104};
105
106struct v_uint32x4
107{
108 typedef unsigned lane_type;
109 enum { nlanes = 4 };
110
111 v_uint32x4() {}
112 explicit v_uint32x4(vuint32m1_t v) : val(v) {}
113 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
114 {
115 unsigned v[] = {v0, v1, v2, v3};
116 val = (vuint32m1_t)vle32_v_u32m1((unsigned int*)v, 4);
117 }
118 unsigned get0() const
119 {
120 return vmv_x_s_u32m1_u32(val);
121 }
122
123 vuint32m1_t val;
124};
125
126struct v_int32x4
127{
128 typedef int lane_type;
129 enum { nlanes = 4 };
130
131 v_int32x4() {}
132 explicit v_int32x4(vint32m1_t v) : val(v) {}
133 v_int32x4(int v0, int v1, int v2, int v3)
134 {
135 int v[] = {v0, v1, v2, v3};
136 val = (vint32m1_t)vle32_v_i32m1((signed int*)v, 4);
137 }
138 int get0() const
139 {
140 return vmv_x_s_i32m1_i32(val);
141 }
142 vint32m1_t val;
143};
144
145struct v_float32x4
146{
147 typedef float lane_type;
148 enum { nlanes = 4 };
149
150 v_float32x4() {}
151 explicit v_float32x4(vfloat32m1_t v) : val(v) {}
152 v_float32x4(float v0, float v1, float v2, float v3)
153 {
154 float v[] = {v0, v1, v2, v3};
155 val = (vfloat32m1_t)vle32_v_f32m1((float*)v, 4);
156 }
157 float get0() const
158 {
159 return vfmv_f_s_f32m1_f32(val);
160 }
161 vfloat32m1_t val;
162};
163
164struct v_uint64x2
165{
166 typedef uint64 lane_type;
167 enum { nlanes = 2 };
168
169 v_uint64x2() {}
170 explicit v_uint64x2(vuint64m1_t v) : val(v) {}
171 v_uint64x2(uint64 v0, uint64 v1)
172 {
173 uint64 v[] = {v0, v1};
174 val = (vuint64m1_t)vle64_v_u64m1((unsigned long*)v, 2);
175 }
176 uint64 get0() const
177 {
178 return vmv_x_s_u64m1_u64(val);
179 }
180 vuint64m1_t val;
181};
182
183struct v_int64x2
184{
185 typedef int64 lane_type;
186 enum { nlanes = 2 };
187
188 v_int64x2() {}
189 explicit v_int64x2(vint64m1_t v) : val(v) {}
190 v_int64x2(int64 v0, int64 v1)
191 {
192 int64 v[] = {v0, v1};
193 val = (vint64m1_t)vle64_v_i64m1((long*)v, 2);
194 }
195 int64 get0() const
196 {
197 return vmv_x_s_i64m1_i64(val);
198 }
199 vint64m1_t val;
200};
201
202struct v_float64x2
203{
204 typedef double lane_type;
205 enum { nlanes = 2 };
206
207 v_float64x2() {}
208 explicit v_float64x2(vfloat64m1_t v) : val(v) {}
209 v_float64x2(double v0, double v1)
210 {
211 double v[] = {v0, v1};
212 val = (vfloat64m1_t)vle64_v_f64m1((double*)v, 2);
213 }
214 double get0() const
215 {
216 return vfmv_f_s_f64m1_f64(val);
217 }
218 vfloat64m1_t val;
219};
220/*
221#define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
222inline _Tp##m1_t vreinterpret_v_##suffix##m1_##suffix##m1(_Tp##m1_t v) { return v; } \
223inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
224inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
225inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
226inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } \
227inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
228inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
229inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
230inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
231inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
232inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }
233
234
235OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
236OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, i8)
237OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
238OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, i16)
239OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
240OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, i32)
241OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
242OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, i64)
243OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
244OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
245*/
246inline v_uint8x16 v_reinterpret_as_u8(const v_uint8x16& v) { return v_uint8x16(v.val); }
247inline v_int8x16 v_reinterpret_as_s8(const v_uint8x16& v) { return v_int8x16(vreinterpret_v_u8m1_i8m1(v.val)); }
248inline v_uint16x8 v_reinterpret_as_u16(const v_uint8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(v.val)); }
249inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); }
250inline v_uint32x4 v_reinterpret_as_u32(const v_uint8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(v.val)); }
251inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
252inline v_uint64x2 v_reinterpret_as_u64(const v_uint8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(v.val)); }
253inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
254inline v_float32x4 v_reinterpret_as_f32(const v_uint8x16& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
255inline v_float64x2 v_reinterpret_as_f64(const v_uint8x16& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
256
257inline v_uint8x16 v_reinterpret_as_u8(const v_int8x16& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(v.val)); }
258inline v_int8x16 v_reinterpret_as_s8(const v_int8x16& v) { return v_int8x16(v.val); }
259inline v_uint16x8 v_reinterpret_as_u16(const v_int8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(vreinterpret_v_i8m1_u8m1(v.val))); }
260inline v_int16x8 v_reinterpret_as_s16(const v_int8x16& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); }
261inline v_uint32x4 v_reinterpret_as_u32(const v_int8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(vreinterpret_v_i8m1_u8m1(v.val))); }
262inline v_int32x4 v_reinterpret_as_s32(const v_int8x16& v) { return v_int32x4(vreinterpret_v_i8m1_i32m1(v.val)); }
263inline v_uint64x2 v_reinterpret_as_u64(const v_int8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(vreinterpret_v_i8m1_u8m1(v.val))); }
264inline v_int64x2 v_reinterpret_as_s64(const v_int8x16& v) { return v_int64x2(vreinterpret_v_i8m1_i64m1(v.val)); }
265inline v_float32x4 v_reinterpret_as_f32(const v_int8x16& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i8m1_i32m1(v.val))); }
266inline v_float64x2 v_reinterpret_as_f64(const v_int8x16& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i8m1_i64m1(v.val))); }
267
268inline v_uint8x16 v_reinterpret_as_u8(const v_uint16x8& v) { return v_uint8x16(vreinterpret_v_u16m1_u8m1(v.val)); }
269inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); }
270inline v_uint16x8 v_reinterpret_as_u16(const v_uint16x8& v) { return v_uint16x8(v.val); }
271inline v_int16x8 v_reinterpret_as_s16(const v_uint16x8& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(v.val)); }
272inline v_uint32x4 v_reinterpret_as_u32(const v_uint16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(v.val)); }
273inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
274inline v_uint64x2 v_reinterpret_as_u64(const v_uint16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(v.val)); }
275inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
276inline v_float32x4 v_reinterpret_as_f32(const v_uint16x8& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
277inline v_float64x2 v_reinterpret_as_f64(const v_uint16x8& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
278
279inline v_uint8x16 v_reinterpret_as_u8(const v_int16x8& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(v.val))); }
280inline v_int8x16 v_reinterpret_as_s8(const v_int16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(v.val)); }
281inline v_uint16x8 v_reinterpret_as_u16(const v_int16x8& v) { return v_uint16x8(vreinterpret_v_i16m1_u16m1(v.val)); }
282inline v_int16x8 v_reinterpret_as_s16(const v_int16x8& v) { return v_int16x8(v.val); }
283inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); }
284inline v_int32x4 v_reinterpret_as_s32(const v_int16x8& v) { return v_int32x4(vreinterpret_v_i16m1_i32m1(v.val)); }
285inline v_uint64x2 v_reinterpret_as_u64(const v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); }
286inline v_int64x2 v_reinterpret_as_s64(const v_int16x8& v) { return v_int64x2(vreinterpret_v_i16m1_i64m1(v.val)); }
287inline v_float32x4 v_reinterpret_as_f32(const v_int16x8& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i16m1_i32m1(v.val))); }
288inline v_float64x2 v_reinterpret_as_f64(const v_int16x8& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i16m1_i64m1(v.val))); }
289
290inline v_uint8x16 v_reinterpret_as_u8(const v_uint32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(v.val)); }
291inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); }
292inline v_uint16x8 v_reinterpret_as_u16(const v_uint32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(v.val)); }
293inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); }
294inline v_uint32x4 v_reinterpret_as_u32(const v_uint32x4& v) { return v_uint32x4(v.val); }
295inline v_int32x4 v_reinterpret_as_s32(const v_uint32x4& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(v.val)); }
296inline v_uint64x2 v_reinterpret_as_u64(const v_uint32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(v.val)); }
297inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
298inline v_float32x4 v_reinterpret_as_f32(const v_uint32x4& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(v.val)); }
299inline v_float64x2 v_reinterpret_as_f64(const v_uint32x4& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
300
301inline v_uint8x16 v_reinterpret_as_u8(const v_int32x4& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(v.val))); }
302inline v_int8x16 v_reinterpret_as_s8(const v_int32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(v.val)); }
303inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); }
304inline v_int16x8 v_reinterpret_as_s16(const v_int32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(v.val)); }
305inline v_uint32x4 v_reinterpret_as_u32(const v_int32x4& v) { return v_uint32x4(vreinterpret_v_i32m1_u32m1(v.val)); }
306inline v_int32x4 v_reinterpret_as_s32(const v_int32x4& v) { return v_int32x4(v.val); }
307inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); }
308inline v_int64x2 v_reinterpret_as_s64(const v_int32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(v.val)); }
309inline v_float32x4 v_reinterpret_as_f32(const v_int32x4& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(v.val)); }
310inline v_float64x2 v_reinterpret_as_f64(const v_int32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(v.val))); }
311
312inline v_uint8x16 v_reinterpret_as_u8(const v_uint64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(v.val)); }
313inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); }
314inline v_uint16x8 v_reinterpret_as_u16(const v_uint64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(v.val)); }
315inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); }
316inline v_uint32x4 v_reinterpret_as_u32(const v_uint64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(v.val)); }
317inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); }
318inline v_uint64x2 v_reinterpret_as_u64(const v_uint64x2& v) { return v_uint64x2(v.val); }
319inline v_int64x2 v_reinterpret_as_s64(const v_uint64x2& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(v.val)); }
320inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(v.val))); }
321inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(v.val)); }
322
323inline v_uint8x16 v_reinterpret_as_u8(const v_int64x2& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(v.val))); }
324inline v_int8x16 v_reinterpret_as_s8(const v_int64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(v.val)); }
325inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); }
326inline v_int16x8 v_reinterpret_as_s16(const v_int64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(v.val)); }
327inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); }
328inline v_int32x4 v_reinterpret_as_s32(const v_int64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(v.val)); }
329inline v_uint64x2 v_reinterpret_as_u64(const v_int64x2& v) { return v_uint64x2(vreinterpret_v_i64m1_u64m1(v.val)); }
330inline v_int64x2 v_reinterpret_as_s64(const v_int64x2& v) { return v_int64x2(v.val); }
331inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(v.val))); }
332inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(v.val)); }
333
334inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(vreinterpret_v_f32m1_u32m1(v.val))); }
335inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); }
336inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_f32m1_u32m1(v.val))); }
337inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); }
338inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& v) { return v_uint32x4(vreinterpret_v_f32m1_u32m1(v.val)); }
339inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& v) { return v_int32x4(vreinterpret_v_f32m1_i32m1(v.val)); }
340inline v_uint64x2 v_reinterpret_as_u64(const v_float32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v.val))); }
341inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); }
342inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& v) { return v_float32x4(v.val); }
343inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); }
344
345inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(vreinterpret_v_f64m1_u64m1(v.val))); }
346inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); }
347inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_f64m1_u64m1(v.val))); }
348inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); }
349inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v.val))); }
350inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); }
351inline v_uint64x2 v_reinterpret_as_u64(const v_float64x2& v) { return v_uint64x2(vreinterpret_v_f64m1_u64m1(v.val)); }
352inline v_int64x2 v_reinterpret_as_s64(const v_float64x2& v) { return v_int64x2(vreinterpret_v_f64m1_i64m1(v.val)); }
353inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); }
354inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64x2(v.val); }
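// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// v_reinterpret_as_* only relabels the bits held in the register; no value
// conversion happens (the float cases go through vreinterpret, never vfcvt).
// The function name is hypothetical.
//
//     inline void example_reinterpret()
//     {
//         v_float32x4 f(1.0f, 2.0f, 3.0f, 4.0f);
//         v_uint32x4 bits = v_reinterpret_as_u32(f);     // bits.get0() == 0x3F800000
//         v_float32x4 back = v_reinterpret_as_f32(bits); // same bit pattern as f
//         (void)back;
//     }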
355
356#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
357inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \
358inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
359
360OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
361OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
362OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
363OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
364OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
365OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
366OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
367OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
368inline v_float32x4 v_setzero_f32() { return v_float32x4(vfmv_v_f_f32m1(0, 4)); }
369inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); }
370
371inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
372inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
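// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// v_setzero_* and v_setall_* broadcast a scalar with vmv.v.x / vfmv.v.f.
// The function name is hypothetical.
//
//     inline void example_broadcast()
//     {
//         v_uint8x16  zeros = v_setzero_u8();      // 16 x 0
//         v_float32x4 halves = v_setall_f32(0.5f); // 4 x 0.5f
//         (void)zeros; (void)halves;
//     }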
373
374
375#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
376inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
377{ \
378 return _Tpvec(intrin(a.val, b.val)); \
379} \
380inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
381{ \
382 a.val = intrin(a.val, b.val); \
383 return a; \
384}
385
386#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
387inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
388{ \
389 return _Tpvec(intrin(a.val, b.val, num)); \
390} \
391inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
392{ \
393 a.val = intrin(a.val, b.val, num); \
394 return a; \
395}
396
397OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
398OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
399OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
400OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
401OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
402OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
403OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
404OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
405OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
406OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
407OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
408OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
409OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
410OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
411OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
412OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
413OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
414OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
415OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
416OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
417OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
418inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
419{
420 return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
421}
422inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
423{
424 a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
425 return a;
426}
427
428OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
429OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
430OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
431inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
432{
433 return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
434}
435inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
436{
437 a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
438 return a;
439}
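// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// Note that the 8- and 16-bit integer operators above are saturating
// (vsaddu/vssub...), while the 32/64-bit integer operators wrap and the
// float operators follow IEEE rules. The function name is hypothetical.
//
//     inline void example_arith()
//     {
//         v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//         v_uint8x16 c = a + b;                   // saturates: every lane == 255
//         v_float32x4 x = v_setall_f32(1.0f), y = v_setall_f32(4.0f);
//         v_float32x4 q = x / y;                  // every lane == 0.25f
//         (void)c; (void)q;
//     }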
440// TODO: exp, log, sin, cos
441
442#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
443inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
444{ \
445 return _Tpvec(intrin(a.val, b.val)); \
446}
447
448#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \
449inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
450{ \
451 return _Tpvec(intrin(a.val, b.val, num)); \
452}
453OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
454OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
455OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
456OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
457OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
458OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
459OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
460OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
461OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
462OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
463OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
464OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
465OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
466OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
467OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
468OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
469
470inline v_float32x4 v_sqrt(const v_float32x4& x)
471{
472 return v_float32x4(vfsqrt_v_f32m1(x.val, 4));
473}
474
475inline v_float32x4 v_invsqrt(const v_float32x4& x)
476{
477 return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));
478}
479
480inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
481{
482 v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
483 return v_sqrt(x);
484}
485
486inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
487{
488 return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
489}
490
491inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
492{
493 return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, 4));
494}
495
496inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
497{
498 return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, 4));
499}
500
501inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
502{
503 return v_fma(a, b, c);
504}
505
506inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
507{
508 return v_fma(a, b, c);
509}
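// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// v_fma / v_muladd compute a*b + c with a single vfmadd/vmadd per vector.
// The function name is hypothetical.
//
//     inline v_float32x4 example_axpy(const v_float32x4& x, const v_float32x4& y)
//     {
//         v_float32x4 alpha = v_setall_f32(2.0f);
//         return v_muladd(alpha, x, y);   // 2*x + y in one fused multiply-add
//     }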
510
511inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
512 const v_float32x4& m1, const v_float32x4& m2,
513 const v_float32x4& m3)
514{
515 vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0);
516 res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
517 res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
518 res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
519 return v_float32x4(res);
520}
521
522inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
523 const v_float32x4& m1, const v_float32x4& m2,
524 const v_float32x4& a)
525{
526 vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0);
527 res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
528 res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
529 res = vfadd_vv_f32m1(res, a.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
530 return v_float32x4(res);
531}
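// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// v_matmul treats m0..m3 as the columns of a 4x4 matrix and v as a column
// vector: result = m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3] (see the vrgather
// broadcasts above); v_matmuladd replaces the last term with an additive
// vector a. The function name is hypothetical.
//
//     inline v_float32x4 example_transform(const v_float32x4& p,
//                                          const v_float32x4& c0, const v_float32x4& c1,
//                                          const v_float32x4& c2, const v_float32x4& c3)
//     {
//         return v_matmul(p, c0, c1, c2, c3);   // 4x4 matrix times 4x1 vector
//     }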
532
533inline v_float64x2 v_sqrt(const v_float64x2& x)
534{
535 return v_float64x2(vfsqrt_v_f64m1(x.val, 2));
536}
537
538inline v_float64x2 v_invsqrt(const v_float64x2& x)
539{
540 return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));
541}
542
543inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
544{
545 v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
546 return v_sqrt(x);
547}
548
549inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
550{
551 return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
552}
553
554inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
555{
556 return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, 2));
557}
558
559inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
560{
561 return v_fma(a, b, c);
562}
563
564#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
565 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
566 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
567 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
568 inline _Tpvec operator ~ (const _Tpvec & a) \
569 { \
570 return _Tpvec(vnot_v_##suffix(a.val, num)); \
571 }
572
573OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
574OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
575OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
576OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
577OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16, i8m1, 16)
578OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8, i16m1, 8)
579OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
580OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
581
582#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
583inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
584{ \
585 return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
586} \
587inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
588{ \
589 a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
590 return a; \
591}
592
593OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
594OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
595OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
596
597inline v_float32x4 operator ~ (const v_float32x4& a)
598{
599 return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
600}
601
602#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
603inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
604{ \
605 return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
606} \
607inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
608{ \
609 a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
610 return a; \
611}
612
613OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
614OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
615OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
616
617inline v_float64x2 operator ~ (const v_float64x2& a)
618{
619 return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
620}
621inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
622{
623 return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));
624}
625inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
626{
627 return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));
628}
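// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// v_mul_hi returns the high 16 bits of the full 32-bit products (vmulh/vmulhu),
// which is the usual building block for fixed-point scaling. The function name
// is hypothetical.
//
//     inline void example_mul_hi()
//     {
//         v_uint16x8 a = v_setall_u16(40000), b = v_setall_u16(3);
//         v_uint16x8 hi = v_mul_hi(a, b);   // (40000*3) >> 16 == 1 in every lane
//         (void)hi;
//     }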
629
630//#define OPENCV_HAL_IMPL_RISCVV_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
631//inline _Tpuvec v_abs(const _Tpsvec& a) { \
632// E##xm1_t mask=vmflt_vf_e32xm1_f32m1(x.val, 0.0, 4);
633
634//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint8x16, v_int8x16, u8, s8)
635//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint16x8, v_int16x8, u16, s16)
636//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint32x4, v_int32x4, u32, s32)
637
638inline v_uint32x4 v_abs(v_int32x4 x)
639{
640 vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4);
641 return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)));
642}
643
644inline v_uint16x8 v_abs(v_int16x8 x)
645{
646 vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8);
647 return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)));
648}
649
650inline v_uint8x16 v_abs(v_int8x16 x)
651{
652 vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16);
653 return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)));
654}
655
656inline v_float32x4 v_abs(v_float32x4 x)
657{
658 return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4);
659}
660
661inline v_float64x2 v_abs(v_float64x2 x)
662{
663 return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2);
664}
665
666inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
667{
668 vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4);
669 return (v_float32x4)vfsgnjx_vv_f32m1(ret, ret, 4);
670}
671
672inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
673{
674 vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2);
675 return (v_float64x2)vfsgnjx_vv_f64m1(ret, ret, 2);
676}
677
678#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
679inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){ \
680 vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num); \
681 vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num); \
682 return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
683}
684
685OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
686OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
687OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)
688
690inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b){
691 vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16);
692 vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16);
693 return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16));
694}
695inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){
696 vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8);
697 vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8);
698 return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8));
699}
700
701#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
702inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \
703 vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
704 vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
705 return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(max, min, num))); \
706}
707
708OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
709OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
710OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
711
712// Multiply and expand
713inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
714 v_int16x8& c, v_int16x8& d)
715{
716 vint16m2_t res = vundefined_i16m2();
717 res = vwmul_vv_i16m2(a.val, b.val, 16);
718 c.val = vget_v_i16m2_i16m1(res, 0);
719 d.val = vget_v_i16m2_i16m1(res, 1);
720}
721
722inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
723 v_uint16x8& c, v_uint16x8& d)
724{
725 vuint16m2_t res = vundefined_u16m2();
726 res = vwmulu_vv_u16m2(a.val, b.val, 16);
727 c.val = vget_v_u16m2_u16m1(res, 0);
728 d.val = vget_v_u16m2_u16m1(res, 1);
729}
730
731inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
732 v_int32x4& c, v_int32x4& d)
733{
734 vint32m2_t res = vundefined_i32m2();
735 res = vwmul_vv_i32m2(a.val, b.val, 8);
736 c.val = vget_v_i32m2_i32m1(res, 0);
737 d.val = vget_v_i32m2_i32m1(res, 1);
738}
739
740inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
741 v_uint32x4& c, v_uint32x4& d)
742{
743 vuint32m2_t res = vundefined_u32m2();
744 res = vwmulu_vv_u32m2(a.val, b.val, 8);
745 c.val = vget_v_u32m2_u32m1(res, 0);
746 d.val = vget_v_u32m2_u32m1(res, 1);
747}
748
749inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
750 v_int64x2& c, v_int64x2& d)
751{
752 vint64m2_t res = vundefined_i64m2();
753 res = vwmul_vv_i64m2(a.val, b.val, 4);
754 c.val = vget_v_i64m2_i64m1(res, 0);
755 d.val = vget_v_i64m2_i64m1(res, 1);
756}
757
758inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
759 v_uint64x2& c, v_uint64x2& d)
760{
761 vuint64m2_t res = vundefined_u64m2();
762 res = vwmulu_vv_u64m2(a.val, b.val, 4);
763 c.val = vget_v_u64m2_u64m1(res, 0);
764 d.val = vget_v_u64m2_u64m1(res, 1);
765}
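// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// v_mul_expand widens the products (vwmul/vwmulu into an m2 register group) and
// hands back the low and high halves as two vectors of the next wider type.
// The function name is hypothetical.
//
//     inline void example_mul_expand()
//     {
//         v_uint16x8 a = v_setall_u16(1000), b = v_setall_u16(1000);
//         v_uint32x4 lo, hi;
//         v_mul_expand(a, b, lo, hi);   // all eight 32-bit products equal 1000000
//         (void)lo; (void)hi;
//     }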
766
767OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
768OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
769OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
770OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
771OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
772OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
773OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
774OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
775OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
776OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
777OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
778OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
780// 16 >> 32
781inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
782{
783 vuint32m2_t vindex = vundefined_u32m2();
784 vuint32m1_t vindex0 = vid_v_u32m1(4);
785 vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
786 vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
787 vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
788 vint32m2_t res = vundefined_i32m2();
789 res = vwmul_vv_i32m2(a.val, b.val, 8);
790 res = vrgather_vv_i32m2(res, vindex, 8);
791 return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4));
792}
793inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
794{
795 vuint32m2_t vindex = vundefined_u32m2();
796 vuint32m1_t vindex0 = vid_v_u32m1(4);
797 vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
798 vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
799 vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
800 vint32m2_t res = vundefined_i32m2();
801 res = vwmul_vv_i32m2(a.val, b.val, 8);
802 res = vrgather_vv_i32m2(res, vindex, 8);
803 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0),vget_v_i32m2_i32m1(res, 1), 4), c.val, 4));
804}
805
806// 32 >> 64
807inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
808{
809 vuint64m2_t vindex = vundefined_u64m2();
810 vuint64m1_t vindex0 = vid_v_u64m1(2);
811 vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
812 vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
813 vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
814 vint64m2_t res = vundefined_i64m2();
815 res = vwmul_vv_i64m2(a.val, b.val, 4);
816 res = vrgather_vv_i64m2(res, vindex, 4);
817 return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2));
818}
819inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
820{
821 vuint64m2_t vindex = vundefined_u64m2();
822 vuint64m1_t vindex0 = vid_v_u64m1(2);
823 vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
824 vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
825 vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
826 vint64m2_t res = vundefined_i64m2();
827 res = vwmul_vv_i64m2(a.val, b.val, 4);
828 res = vrgather_vv_i64m2(res, vindex, 4);
829 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2), c.val, 2));
830}
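// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// v_dotprod multiplies adjacent lane pairs and sums them, producing a vector of
// the doubled element width (16-bit -> 32-bit, 32-bit -> 64-bit); the
// three-argument form also accumulates into c. The function name is hypothetical.
//
//     inline void example_dotprod()
//     {
//         v_int16x8 a = v_setall_s16(2), b = v_setall_s16(3);
//         v_int32x4 d = v_dotprod(a, b);   // each lane == 2*3 + 2*3 == 12
//         (void)d;
//     }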
831
832// 8 >> 32
833inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
834{
835 vuint32m4_t vindex32 = vundefined_u32m4();
836 vuint32m1_t vindex0 = vid_v_u32m1(4);
837 vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
838 vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
839 vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
840 vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
841 vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
842 vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
843 vuint16m2_t v1 = vundefined_u16m2();
844 vuint32m2_t v2 = vundefined_u32m2();
845 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
846 v1 = vrgather_vv_u16m2(v1, vindex, 16);
847 v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
848 return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
849}
850
851inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
852 const v_uint32x4& c)
853{
854 vuint32m4_t vindex32 = vundefined_u32m4();
855 vuint32m1_t vindex0 = vid_v_u32m1(4);
856 vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
857 vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
858 vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
859 vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
860 vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
861 vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
862 vuint16m2_t v1 = vundefined_u16m2();
863 vuint32m2_t v2 = vundefined_u32m2();
864 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
865 v1 = vrgather_vv_u16m2(v1, vindex, 16);
866 v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
867 return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
868}
869
870inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
871{
872 vuint32m4_t vindex32 = vundefined_u32m4();
873 vuint32m1_t vindex0 = vid_v_u32m1(4);
874 vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
875 vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
876 vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
877 vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
878 vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
879 vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
880 vint16m2_t v1 = vundefined_i16m2();
881 vint32m2_t v2 = vundefined_i32m2();
882 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
883 v1 = vrgather_vv_i16m2(v1, vindex, 16);
884 v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
885 return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
886}
887
888inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
889 const v_int32x4& c)
890{
891 vuint32m4_t vindex32 = vundefined_u32m4();
892 vuint32m1_t vindex0 = vid_v_u32m1(4);
893 vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
894 vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
895 vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
896 vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
897 vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
898 vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
899 vint16m2_t v1 = vundefined_i16m2();
900 vint32m2_t v2 = vundefined_i32m2();
901 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
902 v1 = vrgather_vv_i16m2(v1, vindex, 16);
903 v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
904 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
905}
906
907inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
908{
909 vuint64m4_t vindex64 = vundefined_u64m4();
910 vuint64m1_t vindex0 = vid_v_u64m1(2);
911 vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
912 vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
913 vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
914 vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
915 vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
916 vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
917 vuint32m2_t v1 = vundefined_u32m2();
918 vuint64m2_t v2 = vundefined_u64m2();
919 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
920 v1 = vrgather_vv_u32m2(v1, vindex, 8);
921 v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
922 return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
923}
924
925inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
926 const v_uint64x2& c)
927{
928 vuint64m4_t vindex64 = vundefined_u64m4();
929 vuint64m1_t vindex0 = vid_v_u64m1(2);
930 vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
931 vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
932 vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
933 vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
934 vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
935 vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
936 vuint32m2_t v1 = vundefined_u32m2();
937 vuint64m2_t v2 = vundefined_u64m2();
938 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
939 v1 = vrgather_vv_u32m2(v1, vindex, 8);
940 v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
941 return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
942}
943
944inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
945{
946 vuint64m4_t vindex64 = vundefined_u64m4();
947 vuint64m1_t vindex0 = vid_v_u64m1(2);
948 vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
949 vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
950 vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
951 vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
952 vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
953 vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
954 vint32m2_t v1 = vundefined_i32m2();
955 vint64m2_t v2 = vundefined_i64m2();
956 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
957 v1 = vrgather_vv_i32m2(v1, vindex, 8);
958 v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
959 return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
960}
961
962inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
963 const v_int64x2& c)
964{
965 vuint64m4_t vindex64 = vundefined_u64m4();
966 vuint64m1_t vindex0 = vid_v_u64m1(2);
967 vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
968 vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
969 vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
970 vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
971 vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
972 vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
973 vint32m2_t v1 = vundefined_i32m2();
974 vint64m2_t v2 = vundefined_i64m2();
975 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
976 v1 = vrgather_vv_i32m2(v1, vindex, 8);
977 v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
978 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
979}
980
982// 16 >> 32
983inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
984{
985 vint32m2_t v1 = vundefined_i32m2();
986 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
987 return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4));
988}
989
990inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
991{
992 vint32m2_t v1 = vundefined_i32m2();
993 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
994 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4), c.val, 4));
995}
996
997// 32 >> 64
998inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
999{
1000 vint64m2_t v1 = vundefined_i64m2();
1001 v1 = vwmul_vv_i64m2(a.val, b.val, 4);
1002 return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2));
1003}
1004inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1005{
1006 vint64m2_t v1 = vundefined_i64m2();
1007 v1 = vwmul_vv_i64m2(a.val, b.val, 4);
1008 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), c.val, 2));
1009}
1010
1011// 8 >> 32
1012inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
1013{
1014 vuint16m2_t v1 = vundefined_u16m2();
1015 vuint32m2_t v2 = vundefined_u32m2();
1016 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
1017 v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
1018 return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
1019}
1020
1021inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1022{
1023 vuint16m2_t v1 = vundefined_u16m2();
1024 vuint32m2_t v2 = vundefined_u32m2();
1025 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
1026 v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
1027 return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
1028}
1029
1030inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
1031{
1032 vint16m2_t v1 = vundefined_i16m2();
1033 vint32m2_t v2 = vundefined_i32m2();
1034 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
1035 v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
1036 return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
1037}
1038inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1039{
1040 vint16m2_t v1 = vundefined_i16m2();
1041 vint32m2_t v2 = vundefined_i32m2();
1042 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
1043 v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
1044 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
1045}
1046
1047// 16 >> 64
1048inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1049{
1050 vuint32m2_t v1 = vundefined_u32m2();
1051 vuint64m2_t v2 = vundefined_u64m2();
1052 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
1053 v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
1054 return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
1055}
1056inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1057{
1058 vuint32m2_t v1 = vundefined_u32m2();
1059 vuint64m2_t v2 = vundefined_u64m2();
1060 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
1061 v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
1062 return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
1063}
1064
1065inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1066{
1067 vint32m2_t v1 = vundefined_i32m2();
1068 vint64m2_t v2 = vundefined_i64m2();
1069 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
1070 v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
1071 return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
1072}
1073inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1074{
1075 vint32m2_t v1 = vundefined_i32m2();
1076 vint64m2_t v2 = vundefined_i64m2();
1077 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
1078 v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
1079 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
1080}
1081
1082
1083#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
1084inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
1085{\
1086 v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
1087 val = intrin(val, a.val, val, num); \
1088 return vmv_x_s_##len##m1_##len(val); \
1089}
1090
1091
1092#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \
1093inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
1094{\
1095 v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); \
1096 val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
1097 return scalerfunc(val); \
1098}
1099OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
1100OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
1101OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
1102OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
1103OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
1104OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
1105inline float v_reduce_sum(const v_float32x4& a)
1106{
1107 vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4);
1108 val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4);
1109 return vfmv_f_s_f32m1_f32(val);
1110}
1111inline double v_reduce_sum(const v_float64x2& a)
1112{
1113 vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2);
1114 val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2);
1115 return vfmv_f_s_f64m1_f64(val);
1116}
1117inline uint64 v_reduce_sum(const v_uint64x2& a)
1118{ vuint64m1_t res = vundefined_u64m1(); return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); }
1119
1120inline int64 v_reduce_sum(const v_int64x2& a)
1121{ vint64m1_t res = vundefined_i64m1(); return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); }
1122
1123#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
1124OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16, vmv_x_s_i8m1_i8) \
1125OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16) \
1126OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32) \
1127OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, vmv_x_s_i64m1_i64) \
1128OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8) \
1129OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16) \
1130OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32) \
1131OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32)
1132OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
1133OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
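// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// The reductions above collapse one register to a scalar: v_reduce_sum uses
// widening (or ordered float) reduction sums, v_reduce_min/max use vredmin/vredmax.
// The function name is hypothetical.
//
//     inline void example_reduce()
//     {
//         v_int32x4 a(1, 2, 3, 4);
//         int s = v_reduce_sum(a);   // 10
//         int m = v_reduce_max(a);   // 4
//         (void)s; (void)m;
//     }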
1134
1135inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1136 const v_float32x4& c, const v_float32x4& d)
1137{
1138 vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
1139 vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
1140 vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
1141 vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
1142 a0 = vfredosum_vs_f32m1_f32m1(a0, a.val, a0, 4);
1143 b0 = vfredosum_vs_f32m1_f32m1(b0, b.val, b0, 4);
1144 c0 = vfredosum_vs_f32m1_f32m1(c0, c.val, c0, 4);
1145 d0 = vfredosum_vs_f32m1_f32m1(d0, d.val, d0, 4);
1146 vfloat32m1_t res;
1147 res = vslideup_vx_f32m1(a0, b0, 1, 4);
1148 res = vslideup_vx_f32m1(res, c0, 2, 4);
1149 res = vslideup_vx_f32m1(res, d0, 3, 4);
1150 return v_float32x4(res);
1151}
1152
1153inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1154{
1155 vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
1156 vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
1157 vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4);
1158 vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
1159 a0 = vfredosum_vs_f32m1_f32m1(a0, val, a0, 4);
1160 return vfmv_f_s_f32m1_f32(a0);
1161}
1162
1163#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
1164inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec&b){ \
1165 _Tpvec2 x = v_absdiff(a, b); \
1166 return v_reduce_sum(x); \
1167}
1168
1169OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int8x16, v_uint8x16)
1170OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint8x16, v_uint8x16)
1171OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int16x8, v_uint16x8)
1172OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint16x8, v_uint16x8)
1173OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
1174OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
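// Usage sketch (illustrative only; not part of the upstream OpenCV header).
// v_reduce_sad returns the sum of absolute differences of all lanes as a scalar;
// it is built from v_absdiff followed by v_reduce_sum. The function name is
// hypothetical.
//
//     inline void example_sad()
//     {
//         v_uint8x16 a = v_setall_u8(10), b = v_setall_u8(7);
//         unsigned sad = v_reduce_sad(a, b);   // 16 lanes * |10 - 7| == 48
//         (void)sad;
//     }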
1175
1176#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
1177inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1178{ \
1179 vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
1180 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1181} \
1182inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1183{ \
1184 vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
1185 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1186} \
1187inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1188{ \
1189 vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
1190 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1191} \
1192inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1193{ \
1194 vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
1195 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1196} \
1197inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1198{ \
1199 vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
1200 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1201} \
1202inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1203{ \
1204 vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
1205 return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1206} \
1207
1208OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1, 8, 16, _vv_)
1209OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
1210OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
1211OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
1212OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
1213OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
1214OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
1215OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
1216
1217//TODO: ==
1218inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
1219{
1220 vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
1221 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1222 return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1223}
1224inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
1225{
1226 vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
1227 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1228 return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1229}
1230inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
1231{
1232 vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
1233 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1234 return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1235}
1236inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
1237{
1238 vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
1239 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1240 return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1241}
1242inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
1243{
1244 vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
1245 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1246 return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1247}
1248inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
1249{
1250 vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
1251 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1252 return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1253}
1254inline v_float32x4 v_not_nan(const v_float32x4& a)
1255{
1256 vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4);
1257 vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1258 return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1259}
1260
1261//TODO: ==
1262inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
1263{
1264 vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
1265 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1266 return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1267}
1268inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
1269{
1270 vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
1271 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1272 return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1273}
1274inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
1275{
1276 vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
1277 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1278 return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1279}
1280inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
1281{
1282 vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
1283 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1284 return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1285}
1286inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
1287{
1288 vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
1289 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1290 return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1291}
1292inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
1293{
1294 vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
1295 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1296 return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1297}
1298inline v_float64x2 v_not_nan(const v_float64x2& a)
1299{
1300 vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2);
1301 vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1302 return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1303}
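// Usage sketch (illustrative): these comparisons return per-lane masks (all bits set
// where the predicate holds), which are normally consumed by v_select further below:
//   v_float32x4 m    = v_select(a > b, b, a);                        // per-lane min(a, b); a, b hypothetical v_float32x4
//   v_float32x4 safe = v_select(v_not_nan(x), x, v_setall_f32(0.f)); // v_setall_f32 assumed available; zeroes NaN lanes of x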
1304#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
1305inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
1306 const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
1307 v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
1308 v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
1309{ \
1310 vuint32m4_t vindex = vundefined_u32m4(); \
1311 vuint32m1_t vindex0 = vid_v_u32m1(4); \
1312 vindex0 = vsll_vx_u32m1(vindex0, 2, 4); \
1313 vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \
1314 vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); \
1315 vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); \
1316 vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); \
1317 v##_Tp##32m4_t val = vundefined_##_T##m4(); \
1318 val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val); \
1319 val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val); \
1320 val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val); \
1321 val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val); \
1322 val = vrgather_vv_##_T##m4(val, vindex, 16); \
1323 b0.val = vget_v_##_T##m4_##_T##m1(val, 0); \
1324 b1.val = vget_v_##_T##m4_##_T##m1(val, 1); \
1325 b2.val = vget_v_##_T##m4_##_T##m1(val, 2); \
1326 b3.val = vget_v_##_T##m4_##_T##m1(val, 3); \
1327}
1328OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
1329OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
1330OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
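// Usage sketch (illustrative): v_transpose4x4 turns four row vectors of a 4x4 tile
// into four column vectors:
//   v_float32x4 r0 = v_load(m), r1 = v_load(m + 4), r2 = v_load(m + 8), r3 = v_load(m + 12);
//   v_float32x4 c0, c1, c2, c3;
//   v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);   // m: hypothetical float[16], row-major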
1331
1332
1333#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
1334inline _Tpvec operator << (const _Tpvec& a, int n) \
1335{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
1336template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1337{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
1338
1339#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
1340inline _Tpvec operator >> (const _Tpvec& a, int n) \
1341{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
1342template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1343{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
1344template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1345{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }
1346
1347// trade efficiency for convenience
1348#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
1349OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
1350OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)
1351
1352OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
1353OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
1354OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
1355OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
1356OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
1357OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
1358OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
1359OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
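// Usage sketch (illustrative): both the operator and the template forms are generated
// above; v_rshr<n> adds 1 << (n-1) before shifting, i.e. it rounds to nearest:
//   v_uint16x8 q  = v_shr<2>(a);    // a >> 2, truncating (a: hypothetical v_uint16x8)
//   v_uint16x8 qr = v_rshr<2>(a);   // (a + 2) >> 2, rounding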
1360
1361#if 0
1362#define VUP4(n) {0, 1, 2, 3}
1363#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
1364#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
1365#define VUP2(n) {0, 1}
1366#endif
1367#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
1368template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1369{ \
1370 suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
1371 tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
1372 return _Tpvec(tmp);\
1373} \
1374template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1375{ \
1376 suffix##m1_t res = vundefined_##_T##m1(); \
1377 return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\
1378} \
1379template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1380{ return a; } \
1381template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1382{ \
1383 suffix##m2_t tmp = vundefined_##_T##m2(); \
1384 suffix##m2_t res = vundefined_##_T##m2(); \
1385 tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val); \
1386 tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val); \
1387 res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\
1388 return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 0));\
1389} \
1390template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1391{ \
1392 suffix##m2_t tmp = vundefined_##_T##m2(); \
1393 suffix##m2_t res = vundefined_##_T##m2(); \
1394 tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val); \
1395 tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val); \
1396 res = vslideup_vx_##_T##m2(res, tmp, n, num2);\
1397 return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1));\
1398} \
1399template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1400{ \
1401 CV_UNUSED(b); return a; \
1402}
1403
1404OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
1405OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
1406OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
1407OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
1408OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
1409OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
1410OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
1411OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
1412OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
1413OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
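// Usage sketch (illustrative): the two-argument rotates shift elements in from the
// second operand instead of zeros, which is how sliding-window code stitches adjacent
// registers together:
//   v_uint8x16 w = v_rotate_right<1>(cur, next);   // cur[1..15] followed by next[0]; cur, next hypothetical v_uint8x16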
1414
1415#if 1
1416#define vreinterpret_v_i8m1_i8m1
1417#define vreinterpret_v_u8m1_u8m1
1418#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) \
1419inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1420{ \
1421 _Tp2##_t res = vundefined_##len(); \
1422 _Tp2##_t res1 = vundefined_##len(); \
1423 res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); \
1424 res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); \
1425 res = vslideup_vx_##len(res, res1, hnum, num); \
1426 return _Tpvec(res); } \
1427inline _Tpvec v_load_low(const _Tp* ptr) \
1428{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\
1429inline _Tpvec v_load_aligned(const _Tp* ptr) \
1430{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
1431inline _Tpvec v_load(const _Tp* ptr) \
1432{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
1433inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1434{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
1435inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1436{ \
1437 _Tp2##_t a0 = vundefined_##len(); \
1438 a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
1439 vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
1440inline void v_store(_Tp* ptr, const _Tpvec& a) \
1441{ vse##elemsize##_v_##len(ptr, a.val, num); } \
1442inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1443{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
1444inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1445{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
1446inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1447{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); }
1448
1449OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8, u8m1, uchar)
1450OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar)
1451OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16, u8m1, uchar)
1452OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16, i8m1, schar)
1453OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar)
1454OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32, i8m1, schar)
1455OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar)
1456OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64, i8m1, schar)
1457
1458#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
1459inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1460{ \
1461 _Tp2##_t res = vundefined_##len(); \
1462 _Tp2##_t res1 = vundefined_##len(); \
1463 res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \
1464 res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \
1465 res = vslideup_vx_##len(res, res1, hnum, num); \
1466 return _Tpvec(res); } \
1467inline _Tpvec v_load_low(const _Tp* ptr) \
1468{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\
1469inline _Tpvec v_load_aligned(const _Tp* ptr) \
1470{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
1471inline _Tpvec v_load(const _Tp* ptr) \
1472{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
1473inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1474{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
1475inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1476{ \
1477 _Tp2##_t a0 = vundefined_##len(); \
1478 a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
1479 vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
1480inline void v_store(_Tp* ptr, const _Tpvec& a) \
1481{ vse##elemsize##_v_##len(ptr, a.val, num); } \
1482inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1483{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
1484inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1485{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
1486inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1487{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); }
1488OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
1489OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
1490
1491#else
1492
1493#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
1494inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1495{ \
1496 _Tp2##_t res, res1; \
1497 res = vle##elemsize##_v_##len(ptr0, hnum); \
1498 res1 = vle##elemsize##_v_##len(ptr1, hnum); \
1499 res = vslideup_vx_##len(res, res1, hnum, num); \
1500 return _Tpvec(res); } \
1501inline _Tpvec v_load_low(const _Tp* ptr) \
1502{ return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\
1503inline _Tpvec v_load_aligned(const _Tp* ptr) \
1504{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
1505inline _Tpvec v_load(const _Tp* ptr) \
1506{ return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \
1507inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1508{ vse##elemsize##_v_##len(ptr, a.val, hnum);}\
1509inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1510{ \
1511 _Tp2##_t a0; \
1512 a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
1513 vse##elemsize##_v_##len(ptr, a0, hnum);}\
1514inline void v_store(_Tp* ptr, const _Tpvec& a) \
1515{ vse##elemsize##_v_##len(ptr, a.val, num); } \
1516inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1517{ vse##elemsize##_v_##len(ptr, a.val, num); } \
1518inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1519{ vse##elemsize##_v_##len(ptr, a.val, num); } \
1520inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1521{ vse##elemsize##_v_##len(ptr, a.val, num); }
1522
1523OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8)
1524OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8)
1525OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16)
1526OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16)
1527OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32)
1528OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32)
1529OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64)
1530OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64)
1531OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
1532OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
1533
1534#endif
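// Usage sketch (illustrative): v_load/v_store take element pointers and move a full
// 128-bit register; the _low/_high and v_load_halves variants move half of one:
//   short tmp[8];
//   v_int16x8 v = v_load(src);   // src: hypothetical const short* with at least 8 elements
//   v_store(tmp, v);             // tmp now holds all 8 lanes
//   v_store_high(tmp, v);        // overwrites tmp[0..3] with lanes 4..7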
1535
1537
1538inline v_int8x16 v_lut(const schar* tab, const int* idx)
1539{
1540#if 0
1541 schar CV_DECL_ALIGNED(32) elems[16] =
1542 {
1543 tab[idx[ 0]],
1544 tab[idx[ 1]],
1545 tab[idx[ 2]],
1546 tab[idx[ 3]],
1547 tab[idx[ 4]],
1548 tab[idx[ 5]],
1549 tab[idx[ 6]],
1550 tab[idx[ 7]],
1551 tab[idx[ 8]],
1552 tab[idx[ 9]],
1553 tab[idx[10]],
1554 tab[idx[11]],
1555 tab[idx[12]],
1556 tab[idx[13]],
1557 tab[idx[14]],
1558 tab[idx[15]]
1559 };
1560 return v_int8x16(vle8_v_i8m1(elems, 16));
1561#else
1562#if __riscv_v == 7000
1563 return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16));
1564#else
1565 return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16));
1566#endif
1567#endif
1568}
1569
1570inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){
1571#if 0
1572 schar CV_DECL_ALIGNED(32) elems[16] =
1573 {
1574 tab[idx[0]],
1575 tab[idx[0] + 1],
1576 tab[idx[1]],
1577 tab[idx[1] + 1],
1578 tab[idx[2]],
1579 tab[idx[2] + 1],
1580 tab[idx[3]],
1581 tab[idx[3] + 1],
1582 tab[idx[4]],
1583 tab[idx[4] + 1],
1584 tab[idx[5]],
1585 tab[idx[5] + 1],
1586 tab[idx[6]],
1587 tab[idx[6] + 1],
1588 tab[idx[7]],
1589 tab[idx[7] + 1]
1590 };
1591 return v_int8x16(vle8_v_i8m1(elems, 16));
1592#else
1593 vuint32m4_t seq, index;
1594 vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8);
1595 seq = vid_v_u32m4(16);
1596 index = vsrl_vx_u32m4(seq, 1, 16);
1597 vidx = vrgather_vv_u32m4(vidx, index, 16);
1598 index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16);
1599#if __riscv_v == 7000
1600 return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
1601#else
1602 return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
1603#endif
1604#endif
1605}
1606inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1607{
1608#if 0
1609 schar CV_DECL_ALIGNED(32) elems[16] =
1610 {
1611 tab[idx[0]],
1612 tab[idx[0] + 1],
1613 tab[idx[0] + 2],
1614 tab[idx[0] + 3],
1615 tab[idx[1]],
1616 tab[idx[1] + 1],
1617 tab[idx[1] + 2],
1618 tab[idx[1] + 3],
1619 tab[idx[2]],
1620 tab[idx[2] + 1],
1621 tab[idx[2] + 2],
1622 tab[idx[2] + 3],
1623 tab[idx[3]],
1624 tab[idx[3] + 1],
1625 tab[idx[3] + 2],
1626 tab[idx[3] + 3]
1627 };
1628 return v_int8x16(vle8_v_i8m1(elems, 16));
1629#else
1630 vuint32m4_t seq, index;
1631 vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4);
1632 seq = vid_v_u32m4(16);
1633 index = vsrl_vx_u32m4(seq, 2, 16);
1634 vidx = vrgather_vv_u32m4(vidx, index, 16);
1635 seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0));
1636 seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0));
1637 seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0));
1638 index = vadd_vv_u32m4(seq, vidx, 16);
1639#if __riscv_v == 7000
1640 return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
1641#else
1642 return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
1643#endif
1644#endif
1645}
1646
1647inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1648inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1649inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
1650
1651inline v_int16x8 v_lut(const short* tab, const int* idx)
1652{
1653#if 0
1654 short CV_DECL_ALIGNED(32) elems[8] =
1655 {
1656 tab[idx[0]],
1657 tab[idx[1]],
1658 tab[idx[2]],
1659 tab[idx[3]],
1660 tab[idx[4]],
1661 tab[idx[5]],
1662 tab[idx[6]],
1663 tab[idx[7]]
1664 };
1665 return v_int16x8(vle16_v_i16m1(elems, 8));
1666#else
1667#if __riscv_v == 7000
1668 return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8));
1669#else
1670 return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8));
1671#endif
1672#endif
1673}
1674inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1675{
1676#if 0
1677 short CV_DECL_ALIGNED(32) elems[8] =
1678 {
1679 tab[idx[0]],
1680 tab[idx[0] + 1],
1681 tab[idx[1]],
1682 tab[idx[1] + 1],
1683 tab[idx[2]],
1684 tab[idx[2] + 1],
1685 tab[idx[3]],
1686 tab[idx[3] + 1]
1687 };
1688 return v_int16x8(vle16_v_i16m1(elems, 8));
1689#else
1690 vuint32m2_t seq, index;
1691 vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4);
1692 seq = vid_v_u32m2(8);
1693 index = vsrl_vx_u32m2(seq, 1, 8);
1694 vidx = vrgather_vv_u32m2(vidx, index, 8);
1695 index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8);
1696#if __riscv_v == 7000
1697 return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
1698#else
1699 return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
1700#endif
1701#endif
1702}
1703inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1704{
1705#if 0
1706 short CV_DECL_ALIGNED(32) elems[8] =
1707 {
1708 tab[idx[0]],
1709 tab[idx[0] + 1],
1710 tab[idx[0] + 2],
1711 tab[idx[0] + 3],
1712 tab[idx[1]],
1713 tab[idx[1] + 1],
1714 tab[idx[1] + 2],
1715 tab[idx[1] + 3]
1716 };
1717 return v_int16x8(vle16_v_i16m1(elems, 8));
1718#else
1719 vuint32m2_t seq, index;
1720 vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2);
1721 seq = vid_v_u32m2(8);
1722 index = vsrl_vx_u32m2(seq, 2, 8);
1723 vidx = vrgather_vv_u32m2(vidx, index, 8);
1724 seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0));
1725 index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8);
1726#if __riscv_v == 7000
1727 return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
1728#else
1729 return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
1730#endif
1731#endif
1732}
1733inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
1734inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
1735inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
1736
1737inline v_int32x4 v_lut(const int* tab, const int* idx)
1738{
1739#if 0
1740 int CV_DECL_ALIGNED(32) elems[4] =
1741 {
1742 tab[idx[0]],
1743 tab[idx[1]],
1744 tab[idx[2]],
1745 tab[idx[3]]
1746 };
1747 return v_int32x4(vle32_v_i32m1(elems, 4));
1748#else
1749 return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
1750#endif
1751}
1752inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1753{
1754#if 0
1755 int CV_DECL_ALIGNED(32) elems[4] =
1756 {
1757 tab[idx[0]],
1758 tab[idx[0] + 1],
1759 tab[idx[1]],
1760 tab[idx[1] + 1]
1761 };
1762 return v_int32x4(vle32_v_i32m1(elems, 4));
1763#else
1764 vuint32m1_t seq, index;
1765 vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
1766 seq = vid_v_u32m1(4);
1767 index = vsrl_vx_u32m1(seq, 1, 4);
1768 vidx = vrgather_vv_u32m1(vidx, index, 4);
1769 index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
1770 return v_int32x4(vloxei32_v_i32m1(tab, index, 4));
1771#endif
1772}
1773inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1774{
1775 return v_int32x4(vle32_v_i32m1(tab+idx[0], 4));
1776}
1777inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
1778inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1779inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1780
1781inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1782{
1783 //vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
1784 return v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
1785}
1786inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1787{
1788 return v_int64x2(vle64_v_i64m1(tab+idx[0], 2));
1789}
1790
1791inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
1792{
1793 //vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
1794 return v_uint64x2(vloxei64_v_u64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
1795}
1796inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
1797{
1798 return v_uint64x2(vle64_v_u64m1(tab+idx[0], 2));
1799}
1800
1801inline v_float32x4 v_lut(const float* tab, const int* idx)
1802{
1803#if 0
1804 float CV_DECL_ALIGNED(32) elems[4] =
1805 {
1806 tab[idx[0]],
1807 tab[idx[1]],
1808 tab[idx[2]],
1809 tab[idx[3]]
1810 };
1811 return v_float32x4(vle32_v_f32m1(elems, 4));
1812#else
1813 return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
1814#endif
1815}
1816inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1817{
1818#if 0
1819 float CV_DECL_ALIGNED(32) elems[4] =
1820 {
1821 tab[idx[0]],
1822 tab[idx[0]+1],
1823 tab[idx[1]],
1824 tab[idx[1]+1]
1825 };
1826 return v_float32x4(vle32_v_f32m1(elems, 4));
1827#else
1828 vuint32m1_t seq, index;
1829 vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
1830 seq = vid_v_u32m1(4);
1831 index = vsrl_vx_u32m1(seq, 1, 4);
1832 vidx = vrgather_vv_u32m1(vidx, index, 4);
1833 index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
1834 return v_float32x4(vloxei32_v_f32m1(tab, index, 4));
1835#endif
1836}
1837inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1838{
1839 return v_float32x4(vle32_v_f32m1(tab + idx[0], 4));
1840}
1841inline v_float64x2 v_lut(const double* tab, const int* idx)
1842{
1843 //vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
1844 return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
1845}
1846inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1847{
1848 return v_float64x2(vle64_v_f64m1(tab+idx[0], 2));
1849}
1850
1851inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1852{
1853 /*int CV_DECL_ALIGNED(32) elems[4] =
1854 {
1855 tab[idxvec.val[0]],
1856 tab[idxvec.val[1]],
1857 tab[idxvec.val[2]],
1858 tab[idxvec.val[3]]
1859 };*/
1860 return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
1861}
1862
1863inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1864{
1865 /*unsigned CV_DECL_ALIGNED(32) elems[4] =
1866 {
1867 tab[idxvec.val[0]],
1868 tab[idxvec.val[1]],
1869 tab[idxvec.val[2]],
1870 tab[idxvec.val[3]]
1871 };*/
1872 return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
1873}
1874
1875inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1876{
1877 /*float CV_DECL_ALIGNED(32) elems[4] =
1878 {
1879 tab[idxvec.val[0]],
1880 tab[idxvec.val[1]],
1881 tab[idxvec.val[2]],
1882 tab[idxvec.val[3]]
1883 };*/
1884 return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
1885}
1886inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1887{
1888 //vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
1889 return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2));
1890}
1891inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1892{
1893 vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4);
1894 //vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
1895
1896 //x.val = vlxe_v_f32m1(tab, index_x, 4);
1897 //y.val = vlxe_v_f32m1(tab, index_y, 4);
1898 vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4);
1899}
1900
1901inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1902{
1903 int CV_DECL_ALIGNED(32) idx[4];
1904 v_store_aligned(idx, idxvec);
1905
1906 x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1907 y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
1908}
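// Usage sketch (illustrative): v_lut gathers tab[idx[i]] per lane, while v_lut_pairs /
// v_lut_quads gather consecutive pairs / quads starting at each index:
//   int idx[4] = {0, 5, 2, 7};
//   v_float32x4 g = v_lut(tab, idx);   // {tab[0], tab[5], tab[2], tab[7]}; tab: hypothetical const float* table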
1909
1910#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) \
1911inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1912{ \
1913 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1914 tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
1915 tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
1916 return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
1917}\
1918template<int n> inline \
1919v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1920{ \
1921 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1922 tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
1923 tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
1924 return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
1925}\
1926inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1927{ \
1928 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1929 tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
1930 tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
1931 asm("" ::: "memory"); \
1932 vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
1933}\
1934template<int n> inline \
1935void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1936{ \
1937 v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1938 tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
1939 tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
1940 vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
1941}
1942OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8)
1943OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed short, 16)
1944OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32)
1945OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8)
1946OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16)
1947OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32)
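// Usage sketch (illustrative): v_pack narrows two wide vectors into one register
// (the vnclip-based cases above saturate), while v_rshr_pack<n> applies a rounding
// right shift by n bits before narrowing:
//   v_int8x16 packed = v_pack(lo16, hi16);          // lo16, hi16: hypothetical v_int16x8 values
//   v_int8x16 scaled = v_rshr_pack<4>(lo16, hi16);  // rounding >> 4, then saturating narrow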
1948
1949// pack boolean
1950inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1951{
1952 vuint16m2_t tmp = vundefined_u16m2();
1953 tmp = vset_v_u16m1_u16m2(tmp, 0, a.val);
1954 tmp = vset_v_u16m1_u16m2(tmp, 1, b.val);
1955 return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16));
1956}
1957
1958inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1959 const v_uint32x4& c, const v_uint32x4& d)
1960{
1961 vuint32m4_t vabcd = vundefined_u32m4();
1962 vuint16m2_t v16 = vundefined_u16m2();
1963 vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val);
1964 vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val);
1965 vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val);
1966 vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val);
1967 v16 = vnsrl_wx_u16m2(vabcd, 0, 16);
1968 return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
1969}
1970
1971inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
1972 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
1973 const v_uint64x2& g, const v_uint64x2& h)
1974{
1975 vuint64m8_t v64 = vundefined_u64m8();
1976 vuint32m4_t v32 = vundefined_u32m4();
1977 vuint16m2_t v16 = vundefined_u16m2();
1978 v64 = vset_v_u64m1_u64m8(v64, 0, a.val);
1979 v64 = vset_v_u64m1_u64m8(v64, 1, b.val);
1980 v64 = vset_v_u64m1_u64m8(v64, 2, c.val);
1981 v64 = vset_v_u64m1_u64m8(v64, 3, d.val);
1982 v64 = vset_v_u64m1_u64m8(v64, 4, e.val);
1983 v64 = vset_v_u64m1_u64m8(v64, 5, f.val);
1984 v64 = vset_v_u64m1_u64m8(v64, 6, g.val);
1985 v64 = vset_v_u64m1_u64m8(v64, 7, h.val);
1986 v32 = vnsrl_wx_u32m4(v64, 0, 16);
1987 v16 = vnsrl_wx_u16m2(v32, 0, 16);
1988 return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
1989}
1990
1991//inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \
1992//{ \
1993// int16xm2_u tmp; \
1994// tmp.m1[0] = (vint16m1_t)a.val; \
1995// tmp.m1[1] = (vint16m1_t)b.val; \
1996// e8xm1_t mask = (e8xm1_t)vmsge_vx_e16xm2_i16m2(tmp.v, 0, 16);\
1997// return v_uint8x16(vnclipuvi_mask_u8m1_u16m2(vmv_v_x_u8m1(0, 16), (vuint16m2_t)tmp.v, 0, mask, 16));
1998//}
1999
2000#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
2001inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
2002{ \
2003 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
2004 tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
2005 tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
2006 vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
2007 return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1)); \
2008} \
2009inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
2010{ \
2011 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
2012 tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
2013 vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
2014 return vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2); \
2015} \
2016template<int n> inline \
2017v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
2018{ \
2019 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
2020 tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
2021 tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
2022 vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
2023 return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1)); \
2024} \
2025template<int n> inline \
2026void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
2027{ \
2028 vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
2029 tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
2030 vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
2031 vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1); \
2032 return vse##tp1##_v_u##tp1##m1(ptr, val, num2);\
2033}
2034OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
2035OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
2036
2037
2038// saturating multiply 8-bit, 16-bit
2039#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \
2040 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
2041 { \
2042 auto res = mul(a.val, b.val, num); \
2043 return _Tpvec(cvt(res, 0, num)); \
2044 } \
2045 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
2046 { a = a * b; return a; }
2047
2048OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1)
2049OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
2050OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, 32, vwmul_vv_i32m2, vnclip_wx_i16m1)
2051OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, 32, vwmulu_vv_u32m2, vnclipu_wx_u16m1)
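// Usage sketch (illustrative): operator* above widens, multiplies and narrows back
// with saturation, so products that overflow the lane type clamp instead of wrapping:
//   v_uint8x16 prod = a8 * b8;   // a8, b8: hypothetical v_uint8x16; 200 * 2 yields 255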
2052
2053
2054static const signed char popCountTable[256] =
2055{
2056 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
2057 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2058 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2059 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2060 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2061 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2062 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2063 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2064 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2065 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2066 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2067 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2068 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2069 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2070 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2071 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
2072};
2073
2074inline vuint8m1_t vcnt_u8(vuint8m1_t val){
2075#if __riscv_v == 7000
2076 vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16);
2077 return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16);
2078#else
2079 return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16);
2080#endif
2081}
2082
2083inline v_uint8x16
2084v_popcount(const v_uint8x16& a)
2085{
2086 return v_uint8x16(vcnt_u8(a.val));
2087}
2088
2089inline v_uint8x16
2090v_popcount(const v_int8x16& a)
2091{
2092 return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val)));
2093}
2094
2095inline v_uint16x8
2096v_popcount(const v_uint16x8& a)
2097{
2098 vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val));
2099 vuint8m1_t seq = vid_v_u8m1(8);
2100 vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
2101 return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));
2102}
2103
2104inline v_uint16x8
2105v_popcount(const v_int16x8& a)
2106{
2107 vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val)));
2108 vuint8m1_t seq = vid_v_u8m1(8);
2109 vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
2110 return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));
2111}
2112
2113inline v_uint32x4
2114v_popcount(const v_uint32x4& a)
2115{
2116 vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val));
2117 vuint8m1_t seq = vid_v_u8m1(8);
2118 vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
2119 vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
2120 return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));
2121}
2122
2123inline v_uint32x4
2124v_popcount(const v_int32x4& a)
2125{
2126 vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val)));
2127 vuint8m1_t seq = vid_v_u8m1(8);
2128 vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
2129 vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
2130 return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));
2131}
2132
2133inline v_uint64x2
2134v_popcount(const v_uint64x2& a)
2135{
2136 vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val));
2137 vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
2138 vuint16m1_t res1 = vundefined_u16m1();
2139 vuint16m1_t res2 = vundefined_u16m1();
2140 res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
2141 res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
2142 return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));
2143}
2144
2145inline v_uint64x2
2146v_popcount(const v_int64x2& a)
2147{
2148 vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val)));
2149 vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
2150 vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1();
2151 res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
2152 res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
2153 return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));
2154}
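// Usage sketch (illustrative): v_popcount returns per-lane set-bit counts in the
// matching unsigned vector type:
//   v_uint8x16 bits = v_popcount(v_load(p));   // p: hypothetical const uchar* with >= 16 bytes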
2155
2156#define SMASK 1, 2, 4, 8, 16, 32, 64, 128
2157inline int v_signmask(const v_uint8x16& a)
2158{
2159 vuint16m1_t res = vundefined_u16m1();
2160 vuint8m1_t id = vid_v_u8m1(16);
2161 vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
2162 vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16);
2163 vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16);
2164 res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
2165 return vmv_x_s_u16m1_u16(res);
2166}
2167inline int v_signmask(const v_int8x16& a)
2168{
2169 vuint16m1_t res = vundefined_u16m1();
2170 vuint8m1_t id = vid_v_u8m1(16);
2171 vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
2172 vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16);
2173 res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
2174 return vmv_x_s_u16m1_u16(res);
2175}
2176
2177inline int v_signmask(const v_int16x8& a)
2178{
2179 vuint16m1_t res = vundefined_u16m1();
2180 vuint16m1_t id = vid_v_u16m1(8);
2181 vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
2182 vbool16_t mask = vmslt_vx_i16m1_b16(a.val, 0, 8);
2183 res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
2184 return vmv_x_s_u16m1_u16(res);
2185}
2186inline int v_signmask(const v_uint16x8& a)
2187{
2188 vuint16m1_t res = vundefined_u16m1();
2189 vuint16m1_t id = vid_v_u16m1(8);
2190 vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
2191 vuint16m1_t t0 = vsrl_vx_u16m1(a.val, 15, 8);
2192 vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8);
2193 res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8);
2194 return vmv_x_s_u16m1_u16(res);
2195}
2196inline int v_signmask(const v_int32x4& a)
2197{
2198 vuint32m1_t res = vundefined_u32m1();
2199 vuint32m1_t id = vid_v_u32m1(4);
2200 vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
2201 vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4);
2202 res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
2203 return vmv_x_s_u32m1_u32(res);
2204}
2205inline int v_signmask(const v_uint32x4& a)
2206{
2207 vuint32m1_t res = vundefined_u32m1();
2208 vuint32m1_t id = vid_v_u32m1(4);
2209 vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
2210 vuint32m1_t t0 = vsrl_vx_u32m1(a.val, 31, 4);
2211 vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4);
2212 res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
2213 return vmv_x_s_u32m1_u32(res);
2214}
2215inline int v_signmask(const v_uint64x2& a)
2216{
2217 vuint64m1_t res = vundefined_u64m1();
2218 vuint64m1_t id = vid_v_u64m1(2);
2219 vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2);
2220 vuint64m1_t t0 = vsrl_vx_u64m1(a.val, 63, 2);
2221 vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2);
2222 res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2);
2223 return vmv_x_s_u64m1_u64(res);
2224}
2225inline int v_signmask(const v_int64x2& a)
2226{ return v_signmask(v_reinterpret_as_u64(a)); }
2227inline int v_signmask(const v_float64x2& a)
2228{ return v_signmask(v_reinterpret_as_u64(a)); }
2229inline int v_signmask(const v_float32x4& a)
2230{
2231 return v_signmask(v_reinterpret_as_u32(a));
2232 /*
2233 vuint32m1_t res;
2234 vuint32m1_t id = vid_v_u32m1(4);
2235 vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
2236 vbool32_t mask = vmflt_vf_f32m1_b32(a.val, 0, 4);
2237 res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
2238 return vmv_x_s_u32m1_u32(res);*/
2239}
2240
2241inline int v_scan_forward(const v_int8x16& a) {
2242int val = v_signmask(a);
2243if(val==0) return 0;
2244else return trailingZeros32(val); }
2245inline int v_scan_forward(const v_uint8x16& a) {
2246int val = v_signmask(a);
2247if(val==0) return 0;
2248else return trailingZeros32(val); }
2249inline int v_scan_forward(const v_int16x8& a) {
2250int val = v_signmask(a);
2251if(val==0) return 0;
2252else return trailingZeros32(val); }
2253inline int v_scan_forward(const v_uint16x8& a) {
2254int val = v_signmask(a);
2255if(val==0) return 0;
2256else return trailingZeros32(val); }
2257inline int v_scan_forward(const v_int32x4& a) {
2258int val = v_signmask(a);
2259if(val==0) return 0;
2260else return trailingZeros32(val); }
2261inline int v_scan_forward(const v_uint32x4& a) {
2262int val = v_signmask(a);
2263if(val==0) return 0;
2264else return trailingZeros32(val); }
2265inline int v_scan_forward(const v_float32x4& a) {
2266int val = v_signmask(a);
2267if(val==0) return 0;
2268else return trailingZeros32(val); }
2269inline int v_scan_forward(const v_int64x2& a) {
2270int val = v_signmask(a);
2271if(val==0) return 0;
2272else return trailingZeros32(val); }
2273inline int v_scan_forward(const v_uint64x2& a) {
2274int val = v_signmask(a);
2275if(val==0) return 0;
2276else return trailingZeros32(val); }
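// Usage sketch (illustrative): v_signmask packs the lane sign bits into an int
// (lane 0 maps to bit 0), and v_scan_forward gives the index of the first lane with
// its sign bit set (this implementation returns 0 when no lane is set):
//   int mask  = v_signmask(a > b);      // bit i set where a[i] > b[i]; a, b hypothetical v_int32x4
//   int first = v_scan_forward(a > b);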
2277
2278#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \
2279inline bool v_check_all(const v_##_Tpvec& a) \
2280{ \
2281 suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
2282 return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; \
2283} \
2284inline bool v_check_any(const v_##_Tpvec& a) \
2285{ \
2286 suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
2287 return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \
2288}
2289
2290OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16, b8)
2291OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16)
2292OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32)
2293OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64)
2294
2295inline bool v_check_all(const v_int8x16& a)
2296{ return v_check_all(v_reinterpret_as_u8(a)); }
2297inline bool v_check_all(const v_int16x8& a)
2298{ return v_check_all(v_reinterpret_as_u16(a)); }
2299inline bool v_check_all(const v_int32x4& a)
2300{ return v_check_all(v_reinterpret_as_u32(a)); }
2301inline bool v_check_all(const v_float32x4& a)
2302{ return v_check_all(v_reinterpret_as_u32(a)); }
2303inline bool v_check_all(const v_int64x2& a)
2304{ return v_check_all(v_reinterpret_as_u64(a)); }
2305inline bool v_check_all(const v_float64x2& a)
2306{ return v_check_all(v_reinterpret_as_u64(a)); }
2307
2308inline bool v_check_any(const v_int8x16& a)
2309{ return v_check_any(v_reinterpret_as_u8(a)); }
2310inline bool v_check_any(const v_int16x8& a)
2311{ return v_check_any(v_reinterpret_as_u16(a)); }
2312inline bool v_check_any(const v_int32x4& a)
2313{ return v_check_any(v_reinterpret_as_u32(a)); }
2314inline bool v_check_any(const v_float32x4& a)
2315{ return v_check_any(v_reinterpret_as_u32(a)); }
2316inline bool v_check_any(const v_int64x2& a)
2317{ return v_check_any(v_reinterpret_as_u64(a)); }
2318inline bool v_check_any(const v_float64x2& a)
2319{ return v_check_any(v_reinterpret_as_u64(a)); }
2320
2321#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \
2322inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
2323{ \
2324 return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \
2325}
2326
2327OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8)
2328OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16)
2329OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32)
2330OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8)
2331OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16)
2332OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32)
2333inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
2334{
2335 return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4));
2336}
2337inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
2338{
2339 return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2));
2340}
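// Usage sketch (illustrative): v_select keeps a where the mask lane is non-zero and
// b elsewhere, giving a branch-free conditional when fed a comparison result:
//   v_int32x4 clamped = v_select(v > hi, hi, v);   // per-lane min(v, hi); v, hi hypothetical v_int32x4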
2341
2342#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \
2343inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
2344{ \
2345 _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \
2346 b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0); \
2347 b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1); \
2348} \
2349inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
2350{ \
2351 _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num2); \
2352 return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
2353} \
2354inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
2355{ \
2356 _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \
2357 return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \
2358} \
2359inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
2360{ \
2361 _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2); \
2362 _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2); \
2363 return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
2364}
2365
2366OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8)
2367OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16)
2368OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32)
2369OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1, 8)
2370OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1, 16)
2371OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1, 32)
2372
2373inline v_uint32x4 v_load_expand_q(const uchar* ptr)
2374{
2375 vuint16m2_t b = vundefined_u16m2();
2376 vuint32m2_t c = vundefined_u32m2();
2377 vuint8m1_t val = vle8_v_u8m1(ptr, 4);
2378 b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4);
2379 c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);
2380 return v_uint32x4(vget_v_u32m2_u32m1(c, 0));
2381}
2382
2383inline v_int32x4 v_load_expand_q(const schar* ptr)
2384{
2385 vint16m2_t b = vundefined_i16m2();
2386 vint32m2_t c = vundefined_i32m2();
2387 vint8m1_t val = vle8_v_i8m1(ptr, 4);
2388 b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4);
2389 c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);
2390 return v_int32x4(vget_v_i32m2_i32m1(c, 0));
2391}
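// Usage sketch (illustrative): v_expand splits one register into two vectors with
// double-width lanes, and v_load_expand / v_load_expand_q widen straight from memory
// (by 2x and 4x respectively):
//   v_uint16x8 lo, hi;
//   v_expand(v_load(p), lo, hi);        // p: hypothetical const uchar* with >= 16 bytes
//   v_uint32x4 q = v_load_expand_q(p);  // first 4 bytes widened to 32-bit lanes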
2392#define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E}
2393#define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007}
2394#define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007}
2395#define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0}
2396
2397#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh, refunc) \
2398inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
2399{ \
2400 v##_Tp##m2_t tmp = vundefined_##_T##m2();\
2401 tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \
2402 tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \
2403 unsigned mdata[] = VITL_##num; \
2404 vuint32m2_t mask = vle32_v_u32m2(mdata, 8); \
2405 tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2); \
2406 b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \
2407 b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \
2408} \
2409inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2410{ \
2411 v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
2412 return v_##_Tpvec(b0);\
2413} \
2414inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2415{ \
2416 v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
2417 v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
2418 v##_Tp##m1_t b1 = vundefined_##_T##m1(); \
2419 b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \
2420 a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \
2421 b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
2422 return v_##_Tpvec(b1);\
2423} \
2424inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
2425{ \
2426 v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
2427 v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
2428 c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
2429 b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \
2430 a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \
2431 d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
2432}
2433
2434OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
2435OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
2436OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
2437OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
2438OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,)
2439OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,)
2440OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,)
2441OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, vreinterpret_v_u32m2_u64m2)
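// Usage sketch (illustrative): v_zip interleaves two vectors lane by lane and
// v_combine_low / v_combine_high concatenate their halves, e.g. when interleaving
// two planar channels:
//   v_uint8x16 ab0, ab1;
//   v_zip(va, vb, ab0, ab1);   // va, vb hypothetical v_uint8x16; ab0 = {a0,b0,a1,b1,...}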
2442
2443inline v_uint8x16 v_reverse(const v_uint8x16 &a)
2444{
2445 return v_uint8x16(vrgather_vv_u8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));
2446}
2447inline v_int8x16 v_reverse(const v_int8x16 &a)
2448{
2449 return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));
2450}
2451
2452inline v_uint16x8 v_reverse(const v_uint16x8 &a)
2453{
2454 return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));
2455}
2456
2457inline v_int16x8 v_reverse(const v_int16x8 &a)
2458{
2459 return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));
2460}
2461inline v_uint32x4 v_reverse(const v_uint32x4 &a)
2462{
2463 return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));
2464}
2465
2466inline v_int32x4 v_reverse(const v_int32x4 &a)
2467{
2468 return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));
2469}
2470
2471inline v_float32x4 v_reverse(const v_float32x4 &a)
2472{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
2473
2474inline v_uint64x2 v_reverse(const v_uint64x2 &a)
2475{
2476 return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
2477}
2478
2479inline v_int64x2 v_reverse(const v_int64x2 &a)
2480{
2481 return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
2482}
2483
2484inline v_float64x2 v_reverse(const v_float64x2 &a)
2485{
2486 return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
2487}
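// Each v_reverse above gathers with the index vector (n-1) - vid(), i.e. {n-1, ..., 1, 0}.
// For example, v_reverse(v_uint32x4(10, 20, 30, 40)) == {40, 30, 20, 10}.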
2488
2489#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
2490template <int n> \
2491inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2492{ return v_rotate_right<n>(a, b);}
2493OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
2494OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
2495OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
2496OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
2497OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
2498OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
2499OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
2500OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
2501OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
2502OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
2503
2504
2505#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) \
2506template<int i> inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); }
2507
2508OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8)
2509OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, i8m1, 16, vmv_x_s_i8m1_i8)
2510OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16)
2511OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16)
2512OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32)
2513OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32)
2514OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64)
2515OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64)
2516OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, vfmv_f_s_f32m1_f32)
2517OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64)
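// v_extract_n<i> slides lane i down to position 0 (vslidedown) and reads it with the
// scalar-move intrinsic. For example, v_extract_n<2>(v_int32x4(1, 2, 3, 4)) == 3.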
2518
2519#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
2520template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }
2521
2522OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
2523OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
2524OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
2525OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
2526OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
2527OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
2528OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
2529OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
2530OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
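// v_broadcast_element<i> replicates lane i across all lanes via vrgather with a scalar
// index, e.g. v_broadcast_element<1>(v_int32x4(1, 2, 3, 4)) == {2, 2, 2, 2}.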
2531
2532inline void __builtin_riscv_fsrm(int val)
2533{
2534 asm("csrw frm, %0\n\t"
2535 :
2536 :"r"(val));
2537 return;
2538}
2539
2540inline void barrier1(void *arg) {
2541 __asm__ __volatile__("" : : "r" (arg) : "memory");
2542}
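// __builtin_riscv_fsrm writes the dynamic rounding mode (frm CSR): 0 = round to nearest
// even, 1 = round toward zero, 2 = round down, 3 = round up; each conversion routine below
// sets the mode it needs and resets frm to 0 before returning. barrier1 is a compiler-only
// fence (empty asm with a "memory" clobber), apparently used so the conversions are not
// reordered or hoisted across the CSR write.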
2543
2544inline v_int32x4 v_round(const v_float32x4& a)
2545{
2546 __builtin_riscv_fsrm(0);
2547 vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
2548 barrier1(&nan);
2549 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2550 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2551 __builtin_riscv_fsrm(0);
2552 return v_int32x4(val);
2553}
2554inline v_int32x4 v_floor(const v_float32x4& a)
2555{
2556 __builtin_riscv_fsrm(2);
2557 vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
2558 barrier1(&nan);
2559 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2560 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2561 __builtin_riscv_fsrm(0);
2562 return v_int32x4(val);
2563}
2564
2565inline v_int32x4 v_ceil(const v_float32x4& a)
2566{
2567 __builtin_riscv_fsrm(3);
2568 vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
2569 barrier1(&nan);
2570 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2571 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2572 __builtin_riscv_fsrm(0);
2573 return v_int32x4(val);
2574}
2575
2576inline v_int32x4 v_trunc(const v_float32x4& a)
2577{
2578 __builtin_riscv_fsrm(1);
2579 vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
2580 barrier1(&nan);
2581 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2582 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2583 __builtin_riscv_fsrm(0);
2584 return v_int32x4(val);
2585}
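// In the four float32 conversions above, lanes whose exponent field equals 0x7f800000
// (NaN/Inf) are masked out of vfcvt and take the merge value 0 instead, so for example
// v_round(v_float32x4(1.5f, -1.5f, NAN, 2.5f)) yields {2, -2, 0, 2} under
// round-to-nearest-even.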
2586
2587inline v_int32x4 v_round(const v_float64x2& a)
2588{
2589 __builtin_riscv_fsrm(0);
2590 vfloat64m2_t _val = vundefined_f64m2();
2591 _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2592 //_val = vset_f64m2(_val, 1, a.val);
2593 _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
2594 barrier1(&_val);
2595 vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
2596 __builtin_riscv_fsrm(0);
2597 return v_int32x4(val);
2598}
2599inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2600{
2601 __builtin_riscv_fsrm(0);
2602 vfloat64m2_t _val = vundefined_f64m2();
2603 _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2604 _val = vset_v_f64m1_f64m2(_val, 1, b.val);
2605 barrier1(&_val);
2606 vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
2607 __builtin_riscv_fsrm(0);
2608 return v_int32x4(val);
2609}
2610inline v_int32x4 v_floor(const v_float64x2& a)
2611{
2612 __builtin_riscv_fsrm(2);
2613 vfloat64m2_t _val = vundefined_f64m2();
2614 _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2615 vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
2616 vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
2617 barrier1(&nan);
2618 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2619 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2620 __builtin_riscv_fsrm(0);
2621 return v_int32x4(val);
2622}
2623
2624inline v_int32x4 v_ceil(const v_float64x2& a)
2625{
2626 __builtin_riscv_fsrm(3);
2627 vfloat64m2_t _val = vundefined_f64m2();
2628 _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2629 vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
2630 vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
2631 barrier1(&nan);
2632 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2633 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2634 __builtin_riscv_fsrm(0);
2635 return v_int32x4(val);
2636}
2637
2638inline v_int32x4 v_trunc(const v_float64x2& a)
2639{
2640 __builtin_riscv_fsrm(1);
2641 vfloat64m2_t _val = vundefined_f64m2();
2642 _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2643 vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
2644 vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
2645 barrier1(&nan);
2646 vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2647 vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2648 __builtin_riscv_fsrm(0);
2649 return v_int32x4(val);
2650}
2651
2652#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
2653inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
2654{ \
2655 intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \
2656} \
2657inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
2658{ \
2659 intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num); \
2660}\
2661inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
2662 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
2663{ \
2664 intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \
2665} \
2666
2667#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
2668inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2669 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2670{ \
2671 intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \
2672} \
2673inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2674 const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2675{ \
2676 intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \
2677} \
2678inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2679 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
2680 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2681{ \
2682 intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \
2683}
2684
2685#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) \
2686OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize) \
2687OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize)
2688
2689//OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, )
2690OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8)
2691OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16)
2692OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32)
2693
2694OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8)
2695OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16)
2696OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32, 32)
2697
2698#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) \
2699inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
2700{ vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \
2701inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
2702{ vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);}\
2703inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
2704 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
2705{ vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \
2706inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2707 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2708{ vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \
2709inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2710 const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2711{ vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \
2712inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2713 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
2714 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2715{ vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);}
2716
2717OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32)
2718OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64)
2719
2720OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64)
2721OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64)
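// The segment intrinsics (vlseg*/vsseg*) give AoS <-> SoA conversion. Illustrative use:
//   float xy[8] = {0, 10, 1, 11, 2, 12, 3, 13};
//   v_float32x4 x, y;
//   v_load_deinterleave(xy, x, y);   // x = {0,1,2,3}, y = {10,11,12,13} (vlseg2e32)
//   v_store_interleave(xy, x, y);    // writes the pairs back interleaved (vsseg2e32)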
2722
2723inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2724{
2725 return v_float32x4(vfcvt_f_x_v_f32m1(a.val, 4));
2726}
2727
2728#if CV_SIMD128_64F
2729inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2730{
2731 vfloat64m2_t _val = vundefined_f64m2();
2732 _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2733 vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
2734 return v_float32x4(aval);
2735}
2736
2737inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2738{
2739 vfloat64m2_t _val = vundefined_f64m2();
2740 _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2741 _val = vset_v_f64m1_f64m2(_val, 1, b.val);
2742 vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4);
2743 return v_float32x4(aval);
2744}
2745
2746inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2747{
2748 vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
2749 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
2750 return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
2751}
2752
2753inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2754{
2755 vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
2756 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
2757 return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
2758}
2759
2760inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2761{
2762 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
2763 return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
2764}
2765
2766inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2767{
2768 vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
2769 return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
2770}
2771
2772inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2773{
2774 return v_float64x2(vfcvt_f_x_v_f64m1(a.val, 2));
2775}
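// v_cvt_f64 converts the lower two lanes and v_cvt_f64_high the upper two, e.g.
//   v_cvt_f64(v_int32x4(1, 2, 3, 4))      == {1.0, 2.0}
//   v_cvt_f64_high(v_int32x4(1, 2, 3, 4)) == {3.0, 4.0}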
2776
2777#endif
2778inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2779{
2780 uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08};
2781 vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2782 return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
2783}
2784inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
2785{
2786 return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec)));
2787}
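// Read byte by byte (little-endian), the 128-bit gather constant in the signed version
// above is the index sequence {0,2,1,3, 4,6,5,7, 8,10,9,11, 12,14,13,15}: the two middle
// elements of every group of four lanes are swapped, which is the "interleave pairs"
// permutation.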
2788
2789inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2790{
2791 uint64 mdata[2] = {0x0703060205010400, 0x0F0B0E0A0D090C08};
2792 vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2793 return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
2794}
2795inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
2796{
2797 return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec)));
2798}
2799
2800inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2801{
2802 uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
2803 vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2804 return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
2805}
2806inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2807inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2808{
2809 uint64 mdata[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504};
2810 vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2811 return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
2812}
2813inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2814
2815inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2816{
2817 uint64 mdata[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504};
2818 vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2819 return v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
2820}
2821inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2822inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2823inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2824{
2825 uint64 mdata[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
2826 vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2827 return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(m0), 16)));
2828}
2829inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2830
2831inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2832{
2833 uint64 mdata[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
2834 vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2835 return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
2836}
2837inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2838
2839inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2840inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2841inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
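// v_pack_triplets keeps the lower three elements of every group of four and packs them
// contiguously; the 0xFF gather indices are out of range and only affect the tail lanes,
// whose contents are unspecified by the universal-intrinsics API anyway. A 32-bit 4-lane
// vector already holds a single triplet, hence the identity implementations above.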
2842
2843#if CV_SIMD128_64F
2844inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
2845{ return v_cvt_f64(v_dotprod(a, b)); }
2846inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
2847 const v_float64x2& c)
2848{ return v_dotprod_expand(a, b) + c; }
2849inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
2850{
2851 vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
2852 vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), 2);
2853 return v_float64x2(res);
2854}
2855inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
2856{ v_float64x2 res = v_dotprod_expand_fast(a, b);
2857 return res + c; }
2858#endif
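// For 32-bit inputs v_dotprod_expand produces double-precision pair sums, e.g.
//   v_dotprod_expand(v_int32x4(1, 2, 3, 4), v_int32x4(5, 6, 7, 8)) == {17.0, 53.0}
// (1*5 + 2*6 and 3*7 + 4*8).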
2860#if __riscv_v == 7000
2861inline v_float32x4 v_load_expand(const hfloat* ptr)
2862{
2863 vfloat16m1_t v = vle16_v_f16m1((__fp16*)ptr, 4);
2864 vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);
2865 return v_float32x4(vget_v_f32m2_f32m1(v32, 0));
2866}
2867
2868inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2869{
2870 vfloat32m2_t v32 = vundefined_f32m2();
2871 v32 = vset_v_f32m1_f32m2(v32, 0, v.val);
2872 vfloat16m1_t hv = vfncvt_f_f_w_f16m1(v32, 4);
2873 vse16_v_f16m1((__fp16*)ptr, hv, 4);
2874}
2875#else
2876inline v_float32x4 v_load_expand(const hfloat* ptr)
2877{
2878 vfloat16mf2_t v = vle16_v_f16mf2((__fp16*)ptr, 4);
2879 vfloat32m1_t v32 = vfwcvt_f_f_v_f32m1(v, 4);
2880 return v_float32x4(v32);
2881}
2882
2883inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2884{
2885 //vfloat32m2_t v32 = vundefined_f32m2();
2886 //v32 = vset_f32m2(v32, 0, v.val);
2887 vfloat16mf2_t hv = vfncvt_f_f_w_f16mf2(v.val, 4);
2888 vse16_v_f16mf2((__fp16*)ptr, hv, 4);
2889}
2890#endif
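// The __riscv_v == 7000 branch above appears to target the legacy RVV 0.7.x toolchains,
// which lack fractional-LMUL types: float16 data is widened through an m1 -> m2 register
// pair there, while newer toolchains use the f16mf2 fractional type directly in the #else
// path.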
2891
2892inline void v_cleanup() {}
2893
2894CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2895
2897
2898}
2899#endif