EstervQrCode 1.1.1
Library for QR code manipulation
intrin_rvv071.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4 
5 // Copyright (C) 2015, PingTouGe Semiconductor Co., Ltd., all rights reserved.
6 
7 #ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
8 #define OPENCV_HAL_INTRIN_RISCVV_HPP
9 
10 #include <float.h>
11 #include <algorithm>
12 #include "opencv2/core/utility.hpp"
13 
14 namespace cv
15 {
16 
18 
19 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
20 
21 #define CV_SIMD128 1
22 #define CV_SIMD128_64F 1
24 struct v_uint8x16
25 {
26  typedef uchar lane_type;
27  enum { nlanes = 16 };
28 
29  v_uint8x16() {}
30  explicit v_uint8x16(vuint8m1_t v) : val(v) {}
31  v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
32  uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
33  {
34  uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
35  val = (vuint8m1_t)vle8_v_u8m1((unsigned char*)v, 16);
36  }
37  uchar get0() const
38  {
39  return vmv_x_s_u8m1_u8(val);
40  }
41 
42  vuint8m1_t val;
43 };
44 
45 struct v_int8x16
46 {
47  typedef schar lane_type;
48  enum { nlanes = 16 };
49 
50  v_int8x16() {}
51  explicit v_int8x16(vint8m1_t v) : val(v) {}
52  v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
53  schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
54  {
55  schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
56  val = (vint8m1_t)vle8_v_i8m1((schar*)v, 16);
57  }
58  schar get0() const
59  {
60  return vmv_x_s_i8m1_i8(val);
61  }
62 
63  vint8m1_t val;
64 };
65 
66 struct v_uint16x8
67 {
68  typedef ushort lane_type;
69  enum { nlanes = 8 };
70 
71  v_uint16x8() {}
72  explicit v_uint16x8(vuint16m1_t v) : val(v) {}
73  v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
74  {
75  ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
76  val = (vuint16m1_t)vle16_v_u16m1((unsigned short*)v, 8);
77  }
78  ushort get0() const
79  {
80  return vmv_x_s_u16m1_u16(val);
81  }
82 
83  vuint16m1_t val;
84 };
85 
86 struct v_int16x8
87 {
88  typedef short lane_type;
89  enum { nlanes = 8 };
90 
91  v_int16x8() {}
92  explicit v_int16x8(vint16m1_t v) : val(v) {}
93  v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
94  {
95  short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
96  val = (vint16m1_t)vle16_v_i16m1((signed short*)v, 8);
97  }
98  short get0() const
99  {
100  return vmv_x_s_i16m1_i16(val);
101  }
102 
103  vint16m1_t val;
104 };
105 
106 struct v_uint32x4
107 {
108  typedef unsigned lane_type;
109  enum { nlanes = 4 };
110 
111  v_uint32x4() {}
112  explicit v_uint32x4(vuint32m1_t v) : val(v) {}
113  v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
114  {
115  unsigned v[] = {v0, v1, v2, v3};
116  val = (vuint32m1_t)vle32_v_u32m1((unsigned int*)v, 4);
117  }
118  unsigned get0() const
119  {
120  return vmv_x_s_u32m1_u32(val);
121  }
122 
123  vuint32m1_t val;
124 };
125 
126 struct v_int32x4
127 {
128  typedef int lane_type;
129  enum { nlanes = 4 };
130 
131  v_int32x4() {}
132  explicit v_int32x4(vint32m1_t v) : val(v) {}
133  v_int32x4(int v0, int v1, int v2, int v3)
134  {
135  int v[] = {v0, v1, v2, v3};
136  val = (vint32m1_t)vle32_v_i32m1((signed int*)v, 4);
137  }
138  int get0() const
139  {
140  return vmv_x_s_i32m1_i32(val);
141  }
142  vint32m1_t val;
143 };
144 
145 struct v_float32x4
146 {
147  typedef float lane_type;
148  enum { nlanes = 4 };
149 
150  v_float32x4() {}
151  explicit v_float32x4(vfloat32m1_t v) : val(v) {}
152  v_float32x4(float v0, float v1, float v2, float v3)
153  {
154  float v[] = {v0, v1, v2, v3};
155  val = (vfloat32m1_t)vle32_v_f32m1((float*)v, 4);
156  }
157  float get0() const
158  {
159  return vfmv_f_s_f32m1_f32(val);
160  }
161  vfloat32m1_t val;
162 };
163 
164 struct v_uint64x2
165 {
166  typedef uint64 lane_type;
167  enum { nlanes = 2 };
168 
169  v_uint64x2() {}
170  explicit v_uint64x2(vuint64m1_t v) : val(v) {}
171  v_uint64x2(uint64 v0, uint64 v1)
172  {
173  uint64 v[] = {v0, v1};
174  val = (vuint64m1_t)vle64_v_u64m1((unsigned long*)v, 2);
175  }
176  uint64 get0() const
177  {
178  return vmv_x_s_u64m1_u64(val);
179  }
180  vuint64m1_t val;
181 };
182 
183 struct v_int64x2
184 {
185  typedef int64 lane_type;
186  enum { nlanes = 2 };
187 
188  v_int64x2() {}
189  explicit v_int64x2(vint64m1_t v) : val(v) {}
190  v_int64x2(int64 v0, int64 v1)
191  {
192  int64 v[] = {v0, v1};
193  val = (vint64m1_t)vle64_v_i64m1((long*)v, 2);
194  }
195  int64 get0() const
196  {
197  return vmv_x_s_i64m1_i64(val);
198  }
199  vint64m1_t val;
200 };
201 
202 struct v_float64x2
203 {
204  typedef double lane_type;
205  enum { nlanes = 2 };
206 
207  v_float64x2() {}
208  explicit v_float64x2(vfloat64m1_t v) : val(v) {}
209  v_float64x2(double v0, double v1)
210  {
211  double v[] = {v0, v1};
212  val = (vfloat64m1_t)vle64_v_f64m1((double*)v, 2);
213  }
214  double get0() const
215  {
216  return vfmv_f_s_f64m1_f64(val);
217  }
218  vfloat64m1_t val;
219 };
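// --- Editor's note: illustrative sketch, not part of the original header. ---
// Each v_*x* struct above wraps a single RVV m1 register: the scalar constructor
// stores its arguments into a stack array and loads them with vle*, and get0()
// reads lane 0 back with vmv.x.s / vfmv.f.s. The helper name below is hypothetical.
inline int example_lane0_of_int32x4()
{
    v_int32x4 v(10, 20, 30, 40); // vle32 load of {10, 20, 30, 40}
    return v.get0();             // vmv.x.s -> 10
}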
220 /*
221 #define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
222 inline _Tp##m1_t vreinterpret_v_##suffix##m1_##suffix##m1(_Tp##m1_t v) { return v; } \
223 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
224 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
225 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
226 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } \
227 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
228 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
229 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
230 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
231 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
232 inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }
233 
234 
235 OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
236 OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, i8)
237 OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
238 OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, i16)
239 OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
240 OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, i32)
241 OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
242 OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, i64)
243 OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
244 OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
245 */
246 inline v_uint8x16 v_reinterpret_as_u8(const v_uint8x16& v) { return v_uint8x16(v.val); }
247 inline v_int8x16 v_reinterpret_as_s8(const v_uint8x16& v) { return v_int8x16(vreinterpret_v_u8m1_i8m1(v.val)); }
248 inline v_uint16x8 v_reinterpret_as_u16(const v_uint8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(v.val)); }
249 inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); }
250 inline v_uint32x4 v_reinterpret_as_u32(const v_uint8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(v.val)); }
251 inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
252 inline v_uint64x2 v_reinterpret_as_u64(const v_uint8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(v.val)); }
253 inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
254 inline v_float32x4 v_reinterpret_as_f32(const v_uint8x16& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
255 inline v_float64x2 v_reinterpret_as_f64(const v_uint8x16& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
256 
257 inline v_uint8x16 v_reinterpret_as_u8(const v_int8x16& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(v.val)); }
258 inline v_int8x16 v_reinterpret_as_s8(const v_int8x16& v) { return v_int8x16(v.val); }
259 inline v_uint16x8 v_reinterpret_as_u16(const v_int8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(vreinterpret_v_i8m1_u8m1(v.val))); }
260 inline v_int16x8 v_reinterpret_as_s16(const v_int8x16& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); }
261 inline v_uint32x4 v_reinterpret_as_u32(const v_int8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(vreinterpret_v_i8m1_u8m1(v.val))); }
262 inline v_int32x4 v_reinterpret_as_s32(const v_int8x16& v) { return v_int32x4(vreinterpret_v_i8m1_i32m1(v.val)); }
263 inline v_uint64x2 v_reinterpret_as_u64(const v_int8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(vreinterpret_v_i8m1_u8m1(v.val))); }
264 inline v_int64x2 v_reinterpret_as_s64(const v_int8x16& v) { return v_int64x2(vreinterpret_v_i8m1_i64m1(v.val)); }
265 inline v_float32x4 v_reinterpret_as_f32(const v_int8x16& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i8m1_i32m1(v.val))); }
266 inline v_float64x2 v_reinterpret_as_f64(const v_int8x16& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i8m1_i64m1(v.val))); }
267 
268 inline v_uint8x16 v_reinterpret_as_u8(const v_uint16x8& v) { return v_uint8x16(vreinterpret_v_u16m1_u8m1(v.val)); }
269 inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); }
270 inline v_uint16x8 v_reinterpret_as_u16(const v_uint16x8& v) { return v_uint16x8(v.val); }
271 inline v_int16x8 v_reinterpret_as_s16(const v_uint16x8& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(v.val)); }
272 inline v_uint32x4 v_reinterpret_as_u32(const v_uint16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(v.val)); }
273 inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
274 inline v_uint64x2 v_reinterpret_as_u64(const v_uint16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(v.val)); }
275 inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
276 inline v_float32x4 v_reinterpret_as_f32(const v_uint16x8& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
277 inline v_float64x2 v_reinterpret_as_f64(const v_uint16x8& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
278 
279 inline v_uint8x16 v_reinterpret_as_u8(const v_int16x8& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(v.val))); }
280 inline v_int8x16 v_reinterpret_as_s8(const v_int16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(v.val)); }
281 inline v_uint16x8 v_reinterpret_as_u16(const v_int16x8& v) { return v_uint16x8(vreinterpret_v_i16m1_u16m1(v.val)); }
282 inline v_int16x8 v_reinterpret_as_s16(const v_int16x8& v) { return v_int16x8(v.val); }
283 inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); }
284 inline v_int32x4 v_reinterpret_as_s32(const v_int16x8& v) { return v_int32x4(vreinterpret_v_i16m1_i32m1(v.val)); }
285 inline v_uint64x2 v_reinterpret_as_u64(const v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); }
286 inline v_int64x2 v_reinterpret_as_s64(const v_int16x8& v) { return v_int64x2(vreinterpret_v_i16m1_i64m1(v.val)); }
287 inline v_float32x4 v_reinterpret_as_f32(const v_int16x8& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i16m1_i32m1(v.val))); }
288 inline v_float64x2 v_reinterpret_as_f64(const v_int16x8& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i16m1_i64m1(v.val))); }
289 
290 inline v_uint8x16 v_reinterpret_as_u8(const v_uint32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(v.val)); }
291 inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); }
292 inline v_uint16x8 v_reinterpret_as_u16(const v_uint32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(v.val)); }
293 inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); }
294 inline v_uint32x4 v_reinterpret_as_u32(const v_uint32x4& v) { return v_uint32x4(v.val); }
295 inline v_int32x4 v_reinterpret_as_s32(const v_uint32x4& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(v.val)); }
296 inline v_uint64x2 v_reinterpret_as_u64(const v_uint32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(v.val)); }
297 inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
298 inline v_float32x4 v_reinterpret_as_f32(const v_uint32x4& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(v.val)); }
299 inline v_float64x2 v_reinterpret_as_f64(const v_uint32x4& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
300 
301 inline v_uint8x16 v_reinterpret_as_u8(const v_int32x4& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(v.val))); }
302 inline v_int8x16 v_reinterpret_as_s8(const v_int32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(v.val)); }
303 inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); }
304 inline v_int16x8 v_reinterpret_as_s16(const v_int32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(v.val)); }
305 inline v_uint32x4 v_reinterpret_as_u32(const v_int32x4& v) { return v_uint32x4(vreinterpret_v_i32m1_u32m1(v.val)); }
306 inline v_int32x4 v_reinterpret_as_s32(const v_int32x4& v) { return v_int32x4(v.val); }
307 inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); }
308 inline v_int64x2 v_reinterpret_as_s64(const v_int32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(v.val)); }
309 inline v_float32x4 v_reinterpret_as_f32(const v_int32x4& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(v.val)); }
310 inline v_float64x2 v_reinterpret_as_f64(const v_int32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(v.val))); }
311 
312 inline v_uint8x16 v_reinterpret_as_u8(const v_uint64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(v.val)); }
313 inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); }
314 inline v_uint16x8 v_reinterpret_as_u16(const v_uint64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(v.val)); }
315 inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); }
316 inline v_uint32x4 v_reinterpret_as_u32(const v_uint64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(v.val)); }
317 inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); }
318 inline v_uint64x2 v_reinterpret_as_u64(const v_uint64x2& v) { return v_uint64x2(v.val); }
319 inline v_int64x2 v_reinterpret_as_s64(const v_uint64x2& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(v.val)); }
320 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(v.val))); }
321 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(v.val)); }
322 
323 inline v_uint8x16 v_reinterpret_as_u8(const v_int64x2& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(v.val))); }
324 inline v_int8x16 v_reinterpret_as_s8(const v_int64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(v.val)); }
325 inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); }
326 inline v_int16x8 v_reinterpret_as_s16(const v_int64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(v.val)); }
327 inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); }
328 inline v_int32x4 v_reinterpret_as_s32(const v_int64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(v.val)); }
329 inline v_uint64x2 v_reinterpret_as_u64(const v_int64x2& v) { return v_uint64x2(vreinterpret_v_i64m1_u64m1(v.val)); }
330 inline v_int64x2 v_reinterpret_as_s64(const v_int64x2& v) { return v_int64x2(v.val); }
331 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(v.val))); }
332 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(v.val)); }
333 
334 inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(vreinterpret_v_f32m1_u32m1(v.val))); }
335 inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); }
336 inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_f32m1_u32m1(v.val))); }
337 inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); }
338 inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& v) { return v_uint32x4(vreinterpret_v_f32m1_u32m1(v.val)); }
339 inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& v) { return v_int32x4(vreinterpret_v_f32m1_i32m1(v.val)); }
340 inline v_uint64x2 v_reinterpret_as_u64(const v_float32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v.val))); }
341 inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); }
342 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& v) { return v_float32x4(v.val); }
343 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); }
344 
345 inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(vreinterpret_v_f64m1_u64m1(v.val))); }
346 inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); }
347 inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_f64m1_u64m1(v.val))); }
348 inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); }
349 inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v.val))); }
350 inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); }
351 inline v_uint64x2 v_reinterpret_as_u64(const v_float64x2& v) { return v_uint64x2(vreinterpret_v_f64m1_u64m1(v.val)); }
352 inline v_int64x2 v_reinterpret_as_s64(const v_float64x2& v) { return v_int64x2(vreinterpret_v_f64m1_i64m1(v.val)); }
353 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); }
354 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64x2(v.val); }
355 
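// --- Editor's note: hedged example (editor-added, hypothetical helper name). ---
// The v_reinterpret_as_* functions above are pure bit-pattern casts built on the
// vreinterpret intrinsics; no value conversion takes place.
inline unsigned example_float_bit_pattern()
{
    v_float32x4 f(1.0f, 2.0f, 3.0f, 4.0f);
    v_uint32x4 bits = v_reinterpret_as_u32(f);
    return bits.get0(); // 0x3F800000, the IEEE-754 encoding of 1.0f
}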
356 #define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
357 inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \
358 inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
359 
360 OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
361 OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
362 OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
363 OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
364 OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
365 OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
366 OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
367 OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
368 inline v_float32x4 v_setzero_f32() { return v_float32x4(vfmv_v_f_f32m1(0, 4)); }
369 inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); }
370 
371 inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
372 inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
373 
374 
375 #define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
376 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
377 { \
378  return _Tpvec(intrin(a.val, b.val)); \
379 } \
380 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
381 { \
382  a.val = intrin(a.val, b.val); \
383  return a; \
384 }
385 
386 #define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
387 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
388 { \
389  return _Tpvec(intrin(a.val, b.val, num)); \
390 } \
391 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
392 { \
393  a.val = intrin(a.val, b.val, num); \
394  return a; \
395 }
396 
397 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
398 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
399 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
400 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
401 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
402 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
403 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
404 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
405 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4)
406 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
407 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
408 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
409 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
410 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
411 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2)
412 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
413 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
414 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
415 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
416 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
417 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
418 inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
419 {
420  return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
421 }
422 inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
423 {
424  a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
425  return a;
426 }
427 
428 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
429 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
430 OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
431 inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
432 {
433  return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
434 }
435 inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
436 {
437  a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
438  return a;
439 }
440 // TODO: exp, log, sin, cos
441 
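// --- Editor's note: minimal sketch (not in the original source) of the operators above. ---
// The 8- and 16-bit integer operators are generated from the saturating intrinsics
// (vsaddu/vssub/...), so overflow clamps instead of wrapping. Helper name is hypothetical.
inline uchar example_saturating_add_u8()
{
    v_uint8x16 a = v_setall_u8(250);
    v_uint8x16 b = v_setall_u8(10);
    return (a + b).get0(); // 255 (clamped), not 4 (wrapped)
}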
442 #define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
443 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
444 { \
445  return _Tpvec(intrin(a.val, b.val)); \
446 }
447 
448 #define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \
449 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
450 { \
451  return _Tpvec(intrin(a.val, b.val, num)); \
452 }
453 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
454 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
455 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
456 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
457 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
458 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
459 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
460 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
461 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
462 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
463 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
464 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
465 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
466 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
467 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
468 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
469 
470 inline v_float32x4 v_sqrt(const v_float32x4& x)
471 {
472  return v_float32x4(vfsqrt_v_f32m1(x.val, 4));
473 }
474 
475 inline v_float32x4 v_invsqrt(const v_float32x4& x)
476 {
477  return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));
478 }
479 
480 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
481 {
482  v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
483  return v_sqrt(x);
484 }
485 
486 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
487 {
488  return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
489 }
490 
491 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
492 {
493  return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, 4));
494 }
495 
496 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
497 {
498  return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, 4));
499 }
500 
501 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
502 {
503  return v_fma(a, b, c);
504 }
505 
506 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
507 {
508  return v_fma(a, b, c);
509 }
510 
511 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
512  const v_float32x4& m1, const v_float32x4& m2,
513  const v_float32x4& m3)
514 {
515  vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0);
516  res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
517  res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
518  res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
519  return v_float32x4(res);
520 }
521 
522 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
523  const v_float32x4& m1, const v_float32x4& m2,
524  const v_float32x4& a)
525 {
526  vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0);
527  res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
528  res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
529  res = vfadd_vv_f32m1(res, a.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
530  return v_float32x4(res);
531 }
532 
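// --- Editor's note: illustrative sketch (editor-added); the helper below is hypothetical. ---
// v_matmul treats m0..m3 as the columns of a 4x4 matrix: each vrgather_vx broadcast selects
// one lane of v and vfmacc accumulates, giving v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3.
inline v_float32x4 example_identity_matmul(const v_float32x4& v)
{
    v_float32x4 c0(1.f, 0.f, 0.f, 0.f);
    v_float32x4 c1(0.f, 1.f, 0.f, 0.f);
    v_float32x4 c2(0.f, 0.f, 1.f, 0.f);
    v_float32x4 c3(0.f, 0.f, 0.f, 1.f);
    return v_matmul(v, c0, c1, c2, c3); // identity columns -> returns v unchanged
}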
533 inline v_float64x2 v_sqrt(const v_float64x2& x)
534 {
535  return v_float64x2(vfsqrt_v_f64m1(x.val, 2));
536 }
537 
538 inline v_float64x2 v_invsqrt(const v_float64x2& x)
539 {
540  return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));
541 }
542 
543 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
544 {
545  v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
546  return v_sqrt(x);
547 }
548 
549 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
550 {
551  return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
552 }
553 
554 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
555 {
556  return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, 2));
557 }
558 
559 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
560 {
561  return v_fma(a, b, c);
562 }
563 
564 #define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
565  OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
566  OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
567  OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
568  inline _Tpvec operator ~ (const _Tpvec & a) \
569  { \
570  return _Tpvec(vnot_v_##suffix(a.val, num)); \
571  }
572 
573 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
574 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
575 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
576 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
577 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16, i8m1, 16)
578 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8, i16m1, 8)
579 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
580 OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)
581 
582 #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
583 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
584 { \
585  return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
586 } \
587 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
588 { \
589  a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
590  return a; \
591 }
592 
593 OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
594 OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
595 OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
596 
597 inline v_float32x4 operator ~ (const v_float32x4& a)
598 {
599  return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
600 }
601 
602 #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
603 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
604 { \
605  return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
606 } \
607 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
608 { \
609  a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
610  return a; \
611 }
612 
613 OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
614 OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
615 OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
616 
617 inline v_float64x2 operator ~ (const v_float64x2& a)
618 {
619  return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
620 }
621 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
622 {
623  return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));
624 }
625 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
626 {
627  return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));
628 }
629 
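// --- Editor's note: small usage sketch (editor-added, hypothetical helper name). ---
// v_mul_hi keeps only the upper half of the widened product (vmulh/vmulhu).
inline short example_mul_hi_s16()
{
    v_int16x8 a = v_setall_s16(0x4000);
    v_int16x8 b = v_setall_s16(4);
    return v_mul_hi(a, b).get0(); // 0x4000 * 4 = 0x10000, upper 16 bits = 1
}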
630 //#define OPENCV_HAL_IMPL_RISCVV_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
631 //inline _Tpuvec v_abs(const _Tpsvec& a) { \
632 // E##xm1_t mask=vmflt_vf_e32xm1_f32m1(x.val, 0.0, 4);
633 
634 //OPENCV_HAL_IMPL_RISCVV_ABS(v_uint8x16, v_int8x16, u8, s8)
635 //OPENCV_HAL_IMPL_RISCVV_ABS(v_uint16x8, v_int16x8, u16, s16)
636 //OPENCV_HAL_IMPL_RISCVV_ABS(v_uint32x4, v_int32x4, u32, s32)
637 
638 inline v_uint32x4 v_abs(v_int32x4 x)
639 {
640  vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4);
641  return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)));
642 }
643 
644 inline v_uint16x8 v_abs(v_int16x8 x)
645 {
646  vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8);
647  return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)));
648 }
649 
650 inline v_uint8x16 v_abs(v_int8x16 x)
651 {
652  vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16);
653  return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)));
654 }
655 
656 inline v_float32x4 v_abs(v_float32x4 x)
657 {
658  return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4);
659 }
660 
661 inline v_float64x2 v_abs(v_float64x2 x)
662 {
663  return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2);
664 }
665 
666 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
667 {
668  vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4);
669  return (v_float32x4)vfsgnjx_vv_f32m1(ret, ret, 4);
670 }
671 
672 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
673 {
674  vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2);
675  return (v_float64x2)vfsgnjx_vv_f64m1(ret, ret, 2);
676 }
677 
678 #define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
679 inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){ \
680  vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num); \
681  vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num); \
682  return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
683 }
684 
685 OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
686 OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
687 OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)
688 
689 /** Saturating absolute difference **/
690 inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b){
691  vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16);
692  vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16);
693  return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16));
694 }
695 inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){
696  vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8);
697  vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8);
698  return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8));
699 }
700 
701 #define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
702 inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \
703  vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
704  vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
705  return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(max, min, num))); \
706 }
707 
708 OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
709 OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
710 OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
711 
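// --- Editor's note: illustrative sketch (not part of the original header). ---
// For unsigned inputs v_absdiff computes max(a,b) - min(a,b), so the difference
// never wraps; the signed v_absdiffs variants above additionally saturate.
inline uchar example_absdiff_u8()
{
    v_uint8x16 a = v_setall_u8(3);
    v_uint8x16 b = v_setall_u8(10);
    return v_absdiff(a, b).get0(); // |3 - 10| = 7
}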
712 // Multiply and expand
713 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
714  v_int16x8& c, v_int16x8& d)
715 {
716  vint16m2_t res = vundefined_i16m2();
717  res = vwmul_vv_i16m2(a.val, b.val, 16);
718  c.val = vget_v_i16m2_i16m1(res, 0);
719  d.val = vget_v_i16m2_i16m1(res, 1);
720 }
721 
722 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
723  v_uint16x8& c, v_uint16x8& d)
724 {
725  vuint16m2_t res = vundefined_u16m2();
726  res = vwmulu_vv_u16m2(a.val, b.val, 16);
727  c.val = vget_v_u16m2_u16m1(res, 0);
728  d.val = vget_v_u16m2_u16m1(res, 1);
729 }
730 
731 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
732  v_int32x4& c, v_int32x4& d)
733 {
734  vint32m2_t res = vundefined_i32m2();
735  res = vwmul_vv_i32m2(a.val, b.val, 8);
736  c.val = vget_v_i32m2_i32m1(res, 0);
737  d.val = vget_v_i32m2_i32m1(res, 1);
738 }
739 
740 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
741  v_uint32x4& c, v_uint32x4& d)
742 {
743  vuint32m2_t res = vundefined_u32m2();
744  res = vwmulu_vv_u32m2(a.val, b.val, 8);
745  c.val = vget_v_u32m2_u32m1(res, 0);
746  d.val = vget_v_u32m2_u32m1(res, 1);
747 }
748 
749 inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
750  v_int64x2& c, v_int64x2& d)
751 {
752  vint64m2_t res = vundefined_i64m2();
753  res = vwmul_vv_i64m2(a.val, b.val, 4);
754  c.val = vget_v_i64m2_i64m1(res, 0);
755  d.val = vget_v_i64m2_i64m1(res, 1);
756 }
757 
758 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
759  v_uint64x2& c, v_uint64x2& d)
760 {
761  vuint64m2_t res = vundefined_u64m2();
762  res = vwmulu_vv_u64m2(a.val, b.val, 4);
763  c.val = vget_v_u64m2_u64m1(res, 0);
764  d.val = vget_v_u64m2_u64m1(res, 1);
765 }
766 
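// --- Editor's note: editor-added sketch; the helper name is hypothetical. ---
// v_mul_expand widens before multiplying (vwmul/vwmulu), so products that overflow the
// input lane type are preserved: the low half of the products lands in c, the high half in d.
inline ushort example_mul_expand_u8()
{
    v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(200);
    v_uint16x8 c, d;
    v_mul_expand(a, b, c, d);
    return c.get0(); // 200 * 200 = 40000, representable in the 16-bit result
}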
767 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
768 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
769 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
770 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
771 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
772 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
773 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
774 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
775 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
776 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
777 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
778 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
779 //////// Dot Product ////////
780 // 16 >> 32
781 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
782 {
783  vuint32m2_t vindex = vundefined_u32m2();
784  vuint32m1_t vindex0 = vid_v_u32m1(4);
785  vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
786  vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
787  vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
788  vint32m2_t res = vundefined_i32m2();
789  res = vwmul_vv_i32m2(a.val, b.val, 8);
790  res = vrgather_vv_i32m2(res, vindex, 8);
791  return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4));
792 }
793 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
794 {
795  vuint32m2_t vindex = vundefined_u32m2();
796  vuint32m1_t vindex0 = vid_v_u32m1(4);
797  vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
798  vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
799  vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
800  vint32m2_t res = vundefined_i32m2();
801  res = vwmul_vv_i32m2(a.val, b.val, 8);
802  res = vrgather_vv_i32m2(res, vindex, 8);
803  return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0),vget_v_i32m2_i32m1(res, 1), 4), c.val, 4));
804 }
805 
806 // 32 >> 64
807 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
808 {
809  vuint64m2_t vindex = vundefined_u64m2();
810  vuint64m1_t vindex0 = vid_v_u64m1(2);
811  vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
812  vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
813  vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
814  vint64m2_t res = vundefined_i64m2();
815  res = vwmul_vv_i64m2(a.val, b.val, 4);
816  res = vrgather_vv_i64m2(res, vindex, 4);
817  return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2));
818 }
819 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
820 {
821  vuint64m2_t vindex = vundefined_u64m2();
822  vuint64m1_t vindex0 = vid_v_u64m1(2);
823  vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
824  vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
825  vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
826  vint64m2_t res = vundefined_i64m2();
827  res = vwmul_vv_i64m2(a.val, b.val, 4);
828  res = vrgather_vv_i64m2(res, vindex, 4);
829  return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2), c.val, 2));
830 }
831 
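// --- Editor's note: hedged example (editor-added, hypothetical helper name). ---
// v_dotprod pairs adjacent lanes: the vrgather with the even/odd index pattern built above
// separates even and odd products, so result lane i equals a[2i]*b[2i] + a[2i+1]*b[2i+1].
inline int example_dotprod_s16()
{
    v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
    v_int16x8 b(1, 1, 1, 1, 1, 1, 1, 1);
    return v_dotprod(a, b).get0(); // lane 0 = 1*1 + 2*1 = 3
}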
832 // 8 >> 32
833 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
834 {
835  vuint32m4_t vindex32 = vundefined_u32m4();
836  vuint32m1_t vindex0 = vid_v_u32m1(4);
837  vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
838  vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
839  vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
840  vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
841  vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
842  vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
843  vuint16m2_t v1 = vundefined_u16m2();
844  vuint32m2_t v2 = vundefined_u32m2();
845  v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
846  v1 = vrgather_vv_u16m2(v1, vindex, 16);
847  v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
848  return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
849 }
850 
851 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
852  const v_uint32x4& c)
853 {
854  vuint32m4_t vindex32 = vundefined_u32m4();
855  vuint32m1_t vindex0 = vid_v_u32m1(4);
856  vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
857  vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
858  vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
859  vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
860  vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
861  vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
862  vuint16m2_t v1 = vundefined_u16m2();
863  vuint32m2_t v2 = vundefined_u32m2();
864  v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
865  v1 = vrgather_vv_u16m2(v1, vindex, 16);
866  v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
867  return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
868 }
869 
870 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
871 {
872  vuint32m4_t vindex32 = vundefined_u32m4();
873  vuint32m1_t vindex0 = vid_v_u32m1(4);
874  vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
875  vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
876  vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
877  vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
878  vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
879  vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
880  vint16m2_t v1 = vundefined_i16m2();
881  vint32m2_t v2 = vundefined_i32m2();
882  v1 = vwmul_vv_i16m2(a.val, b.val, 16);
883  v1 = vrgather_vv_i16m2(v1, vindex, 16);
884  v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
885  return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
886 }
887 
888 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
889  const v_int32x4& c)
890 {
891  vuint32m4_t vindex32 = vundefined_u32m4();
892  vuint32m1_t vindex0 = vid_v_u32m1(4);
893  vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
894  vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
895  vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
896  vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
897  vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
898  vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
899  vint16m2_t v1 = vundefined_i16m2();
900  vint32m2_t v2 = vundefined_i32m2();
901  v1 = vwmul_vv_i16m2(a.val, b.val, 16);
902  v1 = vrgather_vv_i16m2(v1, vindex, 16);
903  v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
904  return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
905 }
906 
907 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
908 {
909  vuint64m4_t vindex64 = vundefined_u64m4();
910  vuint64m1_t vindex0 = vid_v_u64m1(2);
911  vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
912  vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
913  vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
914  vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
915  vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
916  vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
917  vuint32m2_t v1 = vundefined_u32m2();
918  vuint64m2_t v2 = vundefined_u64m2();
919  v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
920  v1 = vrgather_vv_u32m2(v1, vindex, 8);
921  v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
922  return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
923 }
924 
925 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
926  const v_uint64x2& c)
927 {
928  vuint64m4_t vindex64 = vundefined_u64m4();
929  vuint64m1_t vindex0 = vid_v_u64m1(2);
930  vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
931  vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
932  vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
933  vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
934  vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
935  vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
936  vuint32m2_t v1 = vundefined_u32m2();
937  vuint64m2_t v2 = vundefined_u64m2();
938  v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
939  v1 = vrgather_vv_u32m2(v1, vindex, 8);
940  v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
941  return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
942 }
943 
944 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
945 {
946  vuint64m4_t vindex64 = vundefined_u64m4();
947  vuint64m1_t vindex0 = vid_v_u64m1(2);
948  vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
949  vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
950  vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
951  vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
952  vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
953  vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
954  vint32m2_t v1 = vundefined_i32m2();
955  vint64m2_t v2 = vundefined_i64m2();
956  v1 = vwmul_vv_i32m2(a.val, b.val, 8);
957  v1 = vrgather_vv_i32m2(v1, vindex, 8);
958  v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
959  return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
960 }
961 
962 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
963  const v_int64x2& c)
964 {
965  vuint64m4_t vindex64 = vundefined_u64m4();
966  vuint64m1_t vindex0 = vid_v_u64m1(2);
967  vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
968  vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
969  vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
970  vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
971  vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
972  vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
973  vint32m2_t v1 = vundefined_i32m2();
974  vint64m2_t v2 = vundefined_i64m2();
975  v1 = vwmul_vv_i32m2(a.val, b.val, 8);
976  v1 = vrgather_vv_i32m2(v1, vindex, 8);
977  v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
978  return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
979 }
980 
981 //////// Fast Dot Product ////////
982 // 16 >> 32
983 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
984 {
985  vint32m2_t v1 = vundefined_i32m2();
986  v1 = vwmul_vv_i32m2(a.val, b.val, 8);
987  return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4));
988 }
989 
990 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
991 {
992  vint32m2_t v1 = vundefined_i32m2();
993  v1 = vwmul_vv_i32m2(a.val, b.val, 8);
994  return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4), c.val, 4));
995 }
996 
997 // 32 >> 64
998 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
999 {
1000  vint64m2_t v1 = vundefined_i64m2();
1001  v1 = vwmul_vv_i64m2(a.val, b.val, 4);
1002  return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2));
1003 }
1004 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1005 {
1006  vint64m2_t v1 = vundefined_i64m2();
1007  v1 = vwmul_vv_i64m2(a.val, b.val, 4);
1008  return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), c.val, 2));
1009 }
1010 
1011 // 8 >> 32
1012 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
1013 {
1014  vuint16m2_t v1 = vundefined_u16m2();
1015  vuint32m2_t v2 = vundefined_u32m2();
1016  v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
1017  v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
1018  return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
1019 }
1020 
1021 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1022 {
1023  vuint16m2_t v1 = vundefined_u16m2();
1024  vuint32m2_t v2 = vundefined_u32m2();
1025  v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
1026  v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
1027  return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
1028 }
1029 
1030 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
1031 {
1032  vint16m2_t v1 = vundefined_i16m2();
1033  vint32m2_t v2 = vundefined_i32m2();
1034  v1 = vwmul_vv_i16m2(a.val, b.val, 16);
1035  v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
1036  return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
1037 }
1038 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1039 {
1040  vint16m2_t v1 = vundefined_i16m2();
1041  vint32m2_t v2 = vundefined_i32m2();
1042  v1 = vwmul_vv_i16m2(a.val, b.val, 16);
1043  v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
1044  return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
1045 }
1046 
1047 // 16 >> 64
1048 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1049 {
1050  vuint32m2_t v1 = vundefined_u32m2();
1051  vuint64m2_t v2 = vundefined_u64m2();
1052  v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
1053  v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
1054  return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
1055 }
1056 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1057 {
1058  vuint32m2_t v1 = vundefined_u32m2();
1059  vuint64m2_t v2 = vundefined_u64m2();
1060  v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
1061  v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
1062  return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
1063 }
1064 
1065 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1066 {
1067  vint32m2_t v1 = vundefined_i32m2();
1068  vint64m2_t v2 = vundefined_i64m2();
1069  v1 = vwmul_vv_i32m2(a.val, b.val, 8);
1070  v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
1071  return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
1072 }
1073 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1074 {
1075  vint32m2_t v1 = vundefined_i32m2();
1076  vint64m2_t v2 = vundefined_i64m2();
1077  v1 = vwmul_vv_i32m2(a.val, b.val, 8);
1078  v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
1079  return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
1080 }
1081 
1082 
1083 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
1084 inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
1085 {\
1086  v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
1087  val = intrin(val, a.val, val, num); \
1088  return vmv_x_s_##len##m1_##len(val); \
1089 }
1090 
1091 
1092 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \
1093 inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
1094 {\
1095  v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); \
1096  val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
1097  return scalerfunc(val); \
1098 }
1099 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
1100 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
1101 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
1102 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
1103 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
1104 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
1105 inline float v_reduce_sum(const v_float32x4& a) \
1106 {\
1107  vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \
1108  val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4); \
1109  return vfmv_f_s_f32m1_f32(val); \
1110 }
1111 inline double v_reduce_sum(const v_float64x2& a) \
1112 {\
1113  vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \
1114  val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2); \
1115  return vfmv_f_s_f64m1_f64(val); \
1116 }
1117 inline uint64 v_reduce_sum(const v_uint64x2& a)
1118 { vuint64m1_t res = vundefined_u64m1(); return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); }
1119 
1120 inline int64 v_reduce_sum(const v_int64x2& a)
1121 { vint64m1_t res = vundefined_i64m1(); return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); }
1122 
1123 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
1124 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16, vmv_x_s_i8m1_i8) \
1125 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16) \
1126 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32) \
1127 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, vmv_x_s_i64m1_i64) \
1128 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8) \
1129 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16) \
1130 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32) \
1131 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32)
1132 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
1133 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
1134 
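// --- Editor's note: illustrative sketch (not in the original source). ---
// The v_reduce_* helpers above fold every lane into one scalar via the vector
// reduction intrinsics (vwredsum/vredmax/vredmin/vfredosum).
inline int example_reduce_sum_s32()
{
    v_int32x4 v(1, 2, 3, 4);
    return v_reduce_sum(v); // 1 + 2 + 3 + 4 = 10
}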
1135 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1136  const v_float32x4& c, const v_float32x4& d)
1137 {
1138  vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
1139  vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
1140  vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
1141  vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
1142  a0 = vfredosum_vs_f32m1_f32m1(a0, a.val, a0, 4);
1143  b0 = vfredosum_vs_f32m1_f32m1(b0, b.val, b0, 4);
1144  c0 = vfredosum_vs_f32m1_f32m1(c0, c.val, c0, 4);
1145  d0 = vfredosum_vs_f32m1_f32m1(d0, d.val, d0, 4);
1146  vfloat32m1_t res;
1147  res = vslideup_vx_f32m1(a0, b0, 1, 4);
1148  res = vslideup_vx_f32m1(res, c0, 2, 4);
1149  res = vslideup_vx_f32m1(res, d0, 3, 4);
1150  return v_float32x4(res);
1151 }
1152 
1153 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1154 {
1155  vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
1156  vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
1157  vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4);
1158  vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
1159  a0 = vfredosum_vs_f32m1_f32m1(a0, val, a0, 4);
1160  return vfmv_f_s_f32m1_f32(a0);
1161 }
1162 
1163 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
1164 inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec&b){ \
1165  _Tpvec2 x = v_absdiff(a, b); \
1166  return v_reduce_sum(x); \
1167 }
1168 
1169 OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int8x16, v_uint8x16)
1170 OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint8x16, v_uint8x16)
1171 OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int16x8, v_uint16x8)
1172 OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint16x8, v_uint16x8)
1173 OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
1174 OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
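// v_reduce_sad returns the sum of absolute differences over all lanes, e.g.:
//   v_uint8x16 a(9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9);
//   v_uint8x16 b(6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6);
//   unsigned sad = v_reduce_sad(a, b);   // 16 * |9 - 6| == 48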
1175 
1176 #define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
1177 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1178 { \
1179  vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
1180  return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1181 } \
1182 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1183 { \
1184  vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
1185  return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1186 } \
1187 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1188 { \
1189  vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
1190  return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1191 } \
1192 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1193 { \
1194  vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
1195  return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1196 } \
1197 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1198 { \
1199  vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
1200  return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1201 } \
1202 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1203 { \
1204  vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
1205  return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
1206 } \
1207 
1208 OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1, 8, 16, _vv_)
1209 OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
1210 OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
1211 OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
1212 OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
1213 OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
1214 OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
1215 OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
1216 
1217 //TODO: ==
1218 inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
1219 {
1220  vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
1221  vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1222  return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1223 }
1224 inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
1225 {
1226  vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
1227  vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1228  return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1229 }
1230 inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
1231 {
1232  vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
1233  vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1234  return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1235 }
1236 inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
1237 {
1238  vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
1239  vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1240  return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1241 }
1242 inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
1243 {
1244  vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
1245  vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1246  return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1247 }
1248 inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
1249 {
1250  vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
1251  vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1252  return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1253 }
1254 inline v_float32x4 v_not_nan(const v_float32x4& a)
1255 {
1256  vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4);
1257  vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
1258  return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
1259 }
1260 
1261 //TODO: ==
1262 inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
1263 {
1264  vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
1265  vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1266  return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1267 }
1268 inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
1269 {
1270  vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
1271  vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1272  return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1273 }
1274 inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
1275 {
1276  vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
1277  vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1278  return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1279 }
1280 inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
1281 {
1282  vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
1283  vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1284  return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1285 }
1286 inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
1287 {
1288  vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
1289  vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1290  return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1291 }
1292 inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
1293 {
1294  vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
1295  vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1296  return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1297 }
1298 inline v_float64x2 v_not_nan(const v_float64x2& a)
1299 {
1300  vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2);
1301  vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
1302  return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
1303 }
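// The comparison operators above return per-lane masks: every bit of a lane is
// set when the predicate holds, otherwise the lane is zero. For floating-point
// vectors the all-ones pattern is placed into the float lanes and is meant for
// v_select rather than arithmetic. Illustratively (NAN comes from <math.h>):
//   v_int32x4 a(1, 5, 3, 7), b(4, 4, 4, 4);
//   v_int32x4 m = (a > b);              // lanes: 0, -1, 0, -1
//   v_float32x4 x(1.f, (float)NAN, 3.f, 4.f);
//   v_float32x4 ok = v_not_nan(x);      // all-ones lanes at indices 0, 2, 3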
1304 #define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
1305 inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
1306  const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
1307  v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
1308  v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
1309 { \
1310  vuint32m4_t vindex = vundefined_u32m4(); \
1311  vuint32m1_t vindex0 = vid_v_u32m1(4); \
1312  vindex0 = vsll_vx_u32m1(vindex0, 2, 4); \
1313  vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \
1314  vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); \
1315  vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); \
1316  vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); \
1317  v##_Tp##32m4_t val = vundefined_##_T##m4(); \
1318  val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val); \
1319  val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val); \
1320  val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val); \
1321  val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val); \
1322  val = vrgather_vv_##_T##m4(val, vindex, 16); \
1323  b0.val = vget_v_##_T##m4_##_T##m1(val, 0); \
1324  b1.val = vget_v_##_T##m4_##_T##m1(val, 1); \
1325  b2.val = vget_v_##_T##m4_##_T##m1(val, 2); \
1326  b3.val = vget_v_##_T##m4_##_T##m1(val, 3); \
1327 }
1328 OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
1329 OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
1330 OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
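// v_transpose4x4 turns four row vectors into four column vectors, e.g.:
//   v_int32x4 r0(0, 1, 2, 3), r1(4, 5, 6, 7), r2(8, 9, 10, 11), r3(12, 13, 14, 15);
//   v_int32x4 c0, c1, c2, c3;
//   v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);   // c0 == (0, 4, 8, 12), ...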
1331 
1332 
1333 #define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
1334 inline _Tpvec operator << (const _Tpvec& a, int n) \
1335 { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
1336 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1337 { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
1338 
1339 #define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
1340 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1341 { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
1342 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1343 { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
1344 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
1345 { return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }
1346 
1347 // trade efficiency for convenience
1348 #define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
1349 OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
1350 OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)
1351 
1352 OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
1353 OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
1354 OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
1355 OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
1356 OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
1357 OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
1358 OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
1359 OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
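// v_shr / operator>> truncate, while v_rshr rounds by adding 1 << (n-1) before
// shifting. A small sketch:
//   v_uint16x8 v(6, 7, 8, 9, 10, 11, 12, 13);
//   v_uint16x8 t = v >> 2;          // 1, 1, 2, 2, 2, 2, 3, 3
//   v_uint16x8 r = v_rshr<2>(v);    // (x + 2) >> 2: 2, 2, 2, 2, 3, 3, 3, 3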
1360 
1361 #if 0
1362 #define VUP4(n) {0, 1, 2, 3}
1363 #define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
1364 #define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
1365 #define VUP2(n) {0, 1}
1366 #endif
1367 #define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
1368 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1369 { \
1370  suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
1371  tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
1372  return _Tpvec(tmp);\
1373 } \
1374 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1375 { \
1376  suffix##m1_t res = vundefined_##_T##m1(); \
1377  return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\
1378 } \
1379 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1380 { return a; } \
1381 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1382 { \
1383  suffix##m2_t tmp = vundefined_##_T##m2(); \
1384  suffix##m2_t res = vundefined_##_T##m2(); \
1385  tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val); \
1386  tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val); \
1387  res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\
1388  return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 0));\
1389 } \
1390 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1391 { \
1392  suffix##m2_t tmp = vundefined_##_T##m2(); \
1393  suffix##m2_t res = vundefined_##_T##m2(); \
1394  tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val); \
1395  tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val); \
1396  res = vslideup_vx_##_T##m2(res, tmp, n, num2);\
1397  return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1));\
1398 } \
1399 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1400 { \
1401  CV_UNUSED(b); return a; \
1402 }
1403 
1404 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
1405 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
1406 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
1407 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
1408 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
1409 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
1410 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
1411 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
1412 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
1413 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
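// The rotate helpers shift whole lanes, filling the vacated positions with
// zeros (single-vector forms) or with lanes of the second operand, e.g.:
//   v_int32x4 a(1, 2, 3, 4), b(5, 6, 7, 8);
//   v_int32x4 r  = v_rotate_right<1>(a);     // 2, 3, 4, 0
//   v_int32x4 l  = v_rotate_left<1>(a);      // 0, 1, 2, 3
//   v_int32x4 rb = v_rotate_right<1>(a, b);  // 2, 3, 4, 5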
1414 
1415 #if 1
1416 #define vreinterpret_v_i8m1_i8m1
1417 #define vreinterpret_v_u8m1_u8m1
1418 #define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) \
1419 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1420 { \
1421  _Tp2##_t res = vundefined_##len(); \
1422  _Tp2##_t res1 = vundefined_##len(); \
1423  res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); \
1424  res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); \
1425  res = vslideup_vx_##len(res, res1, hnum, num); \
1426  return _Tpvec(res); } \
1427 inline _Tpvec v_load_low(const _Tp* ptr) \
1428 { return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\
1429 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1430 { return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
1431 inline _Tpvec v_load(const _Tp* ptr) \
1432 { return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
1433 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1434 { vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
1435 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1436 { \
1437  _Tp2##_t a0 = vundefined_##len(); \
1438  a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
1439  vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
1440 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1441 { vse##elemsize##_v_##len(ptr, a.val, num); } \
1442 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1443 { vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
1444 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1445 { vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
1446 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1447 { vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); }
1448 
1449 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8, u8m1, uchar)
1450 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar)
1451 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16, u8m1, uchar)
1452 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16, i8m1, schar)
1453 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar)
1454 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32, i8m1, schar)
1455 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar)
1456 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64, i8m1, schar)
1457 
1458 #define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
1459 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1460 { \
1461  _Tp2##_t res = vundefined_##len(); \
1462  _Tp2##_t res1 = vundefined_##len(); \
1463  res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \
1464  res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \
1465  res = vslideup_vx_##len(res, res1, hnum, num); \
1466  return _Tpvec(res); } \
1467 inline _Tpvec v_load_low(const _Tp* ptr) \
1468 { return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\
1469 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1470 { return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
1471 inline _Tpvec v_load(const _Tp* ptr) \
1472 { return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
1473 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1474 { vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
1475 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1476 { \
1477  _Tp2##_t a0 = vundefined_##len(); \
1478  a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
1479  vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
1480 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1481 { vse##elemsize##_v_##len(ptr, a.val, num); } \
1482 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1483 { vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
1484 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1485 { vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
1486 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1487 { vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); }
1488 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
1489 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
1490 
1491 #else
1492 
1493 #define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
1494 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1495 { \
1496  _Tp2##_t res, res1; \
1497  res = vle##elemsize##_v_##len(ptr0, hnum); \
1498  res1 = vle##elemsize##_v_##len(ptr1, hnum); \
1499  res = vslideup_vx_##len(res, res1, hnum, num); \
1500  return _Tpvec(res); } \
1501 inline _Tpvec v_load_low(const _Tp* ptr) \
1502 { return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\
1503 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1504 { return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
1505 inline _Tpvec v_load(const _Tp* ptr) \
1506 { return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \
1507 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1508 { vse##elemsize##_v_##len(ptr, a.val, hnum);}\
1509 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1510 { \
1511  _Tp2##_t a0; \
1512  a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
1513  vse##elemsize##_v_##len(ptr, a0, hnum);}\
1514 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1515 { vse##elemsize##_v_##len(ptr, a.val, num); } \
1516 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1517 { vse##elemsize##_v_##len(ptr, a.val, num); } \
1518 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1519 { vse##elemsize##_v_##len(ptr, a.val, num); } \
1520 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1521 { vse##elemsize##_v_##len(ptr, a.val, num); }
1522 
1523 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8)
1524 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8)
1525 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16)
1526 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16)
1527 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32)
1528 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32)
1529 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64)
1530 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64)
1531 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
1532 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
1533 
1534 #endif
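// Load/store sketch (illustrative): full-width loads read all lanes,
// v_load_low/v_load_halves read half registers, and v_store_high writes the
// upper half back to memory.
//   float lo[2] = {1.f, 2.f}, hi[2] = {3.f, 4.f}, out[4];
//   v_float32x4 v = v_load_halves(lo, hi);   // lanes: 1, 2, 3, 4
//   v_store(out, v);                         // out == {1, 2, 3, 4}
//   v_store_high(lo, v);                     // lo  == {3, 4}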
1535 
1537 
1538 inline v_int8x16 v_lut(const schar* tab, const int* idx)
1539 {
1540 #if 0
1541  schar CV_DECL_ALIGNED(32) elems[16] =
1542  {
1543  tab[idx[ 0]],
1544  tab[idx[ 1]],
1545  tab[idx[ 2]],
1546  tab[idx[ 3]],
1547  tab[idx[ 4]],
1548  tab[idx[ 5]],
1549  tab[idx[ 6]],
1550  tab[idx[ 7]],
1551  tab[idx[ 8]],
1552  tab[idx[ 9]],
1553  tab[idx[10]],
1554  tab[idx[11]],
1555  tab[idx[12]],
1556  tab[idx[13]],
1557  tab[idx[14]],
1558  tab[idx[15]]
1559  };
1560  return v_int8x16(vle8_v_i8m1(elems, 16));
1561 #else
1562 #if __riscv_v == 7000
1563  return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16));
1564 #else
1565  return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16));
1566 #endif
1567 #endif
1568 }
1569 
1570 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){
1571 #if 0
1572  schar CV_DECL_ALIGNED(32) elems[16] =
1573  {
1574  tab[idx[0]],
1575  tab[idx[0] + 1],
1576  tab[idx[1]],
1577  tab[idx[1] + 1],
1578  tab[idx[2]],
1579  tab[idx[2] + 1],
1580  tab[idx[3]],
1581  tab[idx[3] + 1],
1582  tab[idx[4]],
1583  tab[idx[4] + 1],
1584  tab[idx[5]],
1585  tab[idx[5] + 1],
1586  tab[idx[6]],
1587  tab[idx[6] + 1],
1588  tab[idx[7]],
1589  tab[idx[7] + 1]
1590  };
1591  return v_int8x16(vle8_v_i8m1(elems, 16));
1592 #else
1593  vuint32m4_t seq, index;
1594  vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8);
1595  seq = vid_v_u32m4(16);
1596  index = vsrl_vx_u32m4(seq, 1, 16);
1597  vidx = vrgather_vv_u32m4(vidx, index, 16);
1598  index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16);
1599 #if __riscv_v == 7000
1600  return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
1601 #else
1602  return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
1603 #endif
1604 #endif
1605 }
1606 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1607 {
1608 #if 0
1609  schar CV_DECL_ALIGNED(32) elems[16] =
1610  {
1611  tab[idx[0]],
1612  tab[idx[0] + 1],
1613  tab[idx[0] + 2],
1614  tab[idx[0] + 3],
1615  tab[idx[1]],
1616  tab[idx[1] + 1],
1617  tab[idx[1] + 2],
1618  tab[idx[1] + 3],
1619  tab[idx[2]],
1620  tab[idx[2] + 1],
1621  tab[idx[2] + 2],
1622  tab[idx[2] + 3],
1623  tab[idx[3]],
1624  tab[idx[3] + 1],
1625  tab[idx[3] + 2],
1626  tab[idx[3] + 3]
1627  };
1628  return v_int8x16(vle8_v_i8m1(elems, 16));
1629 #else
1630  vuint32m4_t seq, index;
1631  vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4);
1632  seq = vid_v_u32m4(16);
1633  index = vsrl_vx_u32m4(seq, 2, 16);
1634  vidx = vrgather_vv_u32m4(vidx, index, 16);
1635  seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0));
1636  seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0));
1637  seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0));
1638  index = vadd_vv_u32m4(seq, vidx, 16);
1639 #if __riscv_v == 7000
1640  return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
1641 #else
1642  return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
1643 #endif
1644 #endif
1645 }
1646 
1647 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1648 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1649 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
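// Gather semantics of the v_lut family, written out element-wise (the number
// of indices consumed depends on the overload):
//   v_lut(tab, idx)      : result[i]       == tab[idx[i]]
//   v_lut_pairs(tab, idx): result[2*i + j] == tab[idx[i] + j], j = 0..1
//   v_lut_quads(tab, idx): result[4*i + j] == tab[idx[i] + j], j = 0..3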
1650 
1651 inline v_int16x8 v_lut(const short* tab, const int* idx)
1652 {
1653 #if 0
1654  short CV_DECL_ALIGNED(32) elems[8] =
1655  {
1656  tab[idx[0]],
1657  tab[idx[1]],
1658  tab[idx[2]],
1659  tab[idx[3]],
1660  tab[idx[4]],
1661  tab[idx[5]],
1662  tab[idx[6]],
1663  tab[idx[7]]
1664  };
1665  return v_int16x8(vle16_v_i16m1(elems, 8));
1666 #else
1667 #if __riscv_v == 7000
1668  return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8));
1669 #else
1670  return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8));
1671 #endif
1672 #endif
1673 }
1674 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1675 {
1676 #if 0
1677  short CV_DECL_ALIGNED(32) elems[8] =
1678  {
1679  tab[idx[0]],
1680  tab[idx[0] + 1],
1681  tab[idx[1]],
1682  tab[idx[1] + 1],
1683  tab[idx[2]],
1684  tab[idx[2] + 1],
1685  tab[idx[3]],
1686  tab[idx[3] + 1]
1687  };
1688  return v_int16x8(vle16_v_i16m1(elems, 8));
1689 #else
1690  vuint32m2_t seq, index;
1691  vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4);
1692  seq = vid_v_u32m2(8);
1693  index = vsrl_vx_u32m2(seq, 1, 8);
1694  vidx = vrgather_vv_u32m2(vidx, index, 8);
1695  index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8);
1696 #if __riscv_v == 7000
1697  return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
1698 #else
1699  return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
1700 #endif
1701 #endif
1702 }
1703 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1704 {
1705 #if 0
1706  short CV_DECL_ALIGNED(32) elems[8] =
1707  {
1708  tab[idx[0]],
1709  tab[idx[0] + 1],
1710  tab[idx[0] + 2],
1711  tab[idx[0] + 3],
1712  tab[idx[1]],
1713  tab[idx[1] + 1],
1714  tab[idx[1] + 2],
1715  tab[idx[1] + 3]
1716  };
1717  return v_int16x8(vle16_v_i16m1(elems, 8));
1718 #else
1719  vuint32m2_t seq, index;
1720  vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2);
1721  seq = vid_v_u32m2(8);
1722  index = vsrl_vx_u32m2(seq, 2, 8);
1723  vidx = vrgather_vv_u32m2(vidx, index, 8);
1724  seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0));
1725  index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8);
1726 #if __riscv_v == 7000
1727  return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
1728 #else
1729  return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
1730 #endif
1731 #endif
1732 }
1733 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
1734 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
1735 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
1736 
1737 inline v_int32x4 v_lut(const int* tab, const int* idx)
1738 {
1739 #if 0
1740  int CV_DECL_ALIGNED(32) elems[4] =
1741  {
1742  tab[idx[0]],
1743  tab[idx[1]],
1744  tab[idx[2]],
1745  tab[idx[3]]
1746  };
1747  return v_int32x4(vle32_v_i32m1(elems, 4));
1748 #else
1749  return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
1750 #endif
1751 }
1752 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1753 {
1754 #if 0
1755  int CV_DECL_ALIGNED(32) elems[4] =
1756  {
1757  tab[idx[0]],
1758  tab[idx[0] + 1],
1759  tab[idx[1]],
1760  tab[idx[1] + 1]
1761  };
1762  return v_int32x4(vle32_v_i32m1(elems, 4));
1763 #else
1764  vuint32m1_t seq, index;
1765  vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
1766  seq = vid_v_u32m1(4);
1767  index = vsrl_vx_u32m1(seq, 1, 4);
1768  vidx = vrgather_vv_u32m1(vidx, index, 4);
1769  index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
1770  return v_int32x4(vloxei32_v_i32m1(tab, index, 4));
1771 #endif
1772 }
1773 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1774 {
1775  return v_int32x4(vle32_v_i32m1(tab+idx[0], 4));
1776 }
1777 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
1778 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1779 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1780 
1781 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1782 {
1783  //vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
1784  return v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
1785 }
1786 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1787 {
1788  return v_int64x2(vle64_v_i64m1(tab+idx[0], 2));
1789 }
1790 
1791 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
1792 {
1793  //vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
1794  return v_uint64x2(vloxei64_v_u64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
1795 }
1796 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
1797 {
1798  return v_uint64x2(vle64_v_u64m1(tab+idx[0], 2));
1799 }
1800 
1801 inline v_float32x4 v_lut(const float* tab, const int* idx)
1802 {
1803 #if 0
1804  float CV_DECL_ALIGNED(32) elems[4] =
1805  {
1806  tab[idx[0]],
1807  tab[idx[1]],
1808  tab[idx[2]],
1809  tab[idx[3]]
1810  };
1811  return v_float32x4(vle32_v_f32m1(elems, 4));
1812 #else
1813  return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
1814 #endif
1815 }
1816 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1817 {
1818 #if 0
1819  float CV_DECL_ALIGNED(32) elems[4] =
1820  {
1821  tab[idx[0]],
1822  tab[idx[0]+1],
1823  tab[idx[1]],
1824  tab[idx[1]+1]
1825  };
1826  return v_float32x4(vle32_v_f32m1(elems, 4));
1827 #else
1828  vuint32m1_t seq, index;
1829  vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
1830  seq = vid_v_u32m1(4);
1831  index = vsrl_vx_u32m1(seq, 1, 4);
1832  vidx = vrgather_vv_u32m1(vidx, index, 4);
1833  index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
1834  return v_float32x4(vloxei32_v_f32m1(tab, index, 4));
1835 #endif
1836 }
1837 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1838 {
1839  return v_float32x4(vle32_v_f32m1(tab + idx[0], 4));
1840 }
1841 inline v_float64x2 v_lut(const double* tab, const int* idx)
1842 {
1843  //vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
1844  return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
1845 }
1846 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1847 {
1848  return v_float64x2(vle64_v_f64m1(tab+idx[0], 2));
1849 }
1850 
1851 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1852 {
1853  /*int CV_DECL_ALIGNED(32) elems[4] =
1854  {
1855  tab[idxvec.val[0]],
1856  tab[idxvec.val[1]],
1857  tab[idxvec.val[2]],
1858  tab[idxvec.val[3]]
1859  };*/
1860  return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
1861 }
1862 
1863 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1864 {
1865  /*unsigned CV_DECL_ALIGNED(32) elems[4] =
1866  {
1867  tab[idxvec.val[0]],
1868  tab[idxvec.val[1]],
1869  tab[idxvec.val[2]],
1870  tab[idxvec.val[3]]
1871  };*/
1872  return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
1873 }
1874 
1875 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1876 {
1877  /*float CV_DECL_ALIGNED(32) elems[4] =
1878  {
1879  tab[idxvec.val[0]],
1880  tab[idxvec.val[1]],
1881  tab[idxvec.val[2]],
1882  tab[idxvec.val[3]]
1883  };*/
1884  return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
1885 }
1886 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1887 {
1888  //vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
1889  return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2));
1890 }
1891 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1892 {
1893  vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4);
1894  //vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
1895 
1896  //x.val = vlxe_v_f32m1(tab, index_x, 4);
1897  //y.val = vlxe_v_f32m1(tab, index_y, 4);
1898  vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4);
1899 }
1900 
1901 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1902 {
1903  int CV_DECL_ALIGNED(32) idx[4];
1904  v_store_aligned(idx, idxvec);
1905 
1906  x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1907  y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
1908 }
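// v_lut_deinterleave gathers interleaved pairs: x[i] = tab[idx[i]] and
// y[i] = tab[idx[i] + 1], so packed coordinates split into separate vectors:
//   float tab[8] = {1.f, 10.f, 2.f, 20.f, 3.f, 30.f, 4.f, 40.f};  // x,y pairs
//   v_int32x4 idx(0, 2, 4, 6);
//   v_float32x4 x, y;
//   v_lut_deinterleave(tab, idx, x, y);   // x == (1,2,3,4), y == (10,20,30,40)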
1909 
1910 #define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) \
1911 inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1912 { \
1913  v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1914  tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
1915  tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
1916  return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
1917 }\
1918 template<int n> inline \
1919 v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1920 { \
1921  v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1922  tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
1923  tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
1924  return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
1925 }\
1926 inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1927 { \
1928  v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1929  tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
1930  tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
1931  asm("" ::: "memory"); \
1932  vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
1933 }\
1934 template<int n> inline \
1935 void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1936 { \
1937  v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
1938  tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
1939  tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
1940  vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
1941 }
1942 OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8)
1943 OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed short, 16)
1944 OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32)
1945 OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8)
1946 OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16)
1947 OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32)
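// The pack helpers narrow two wide vectors into one, saturating to the output
// range; v_pack_store narrows a single vector straight to memory. Sketch:
//   v_int16x8 a(1, 2, 300, -300, 5, 6, 7, 8), b(0, 0, 0, 0, 0, 0, 0, 0);
//   v_int8x16 p = v_pack(a, b);   // 1, 2, 127, -128, 5, 6, 7, 8, then eight zeros
//   schar out[8];
//   v_pack_store(out, a);         // stores the saturated lanes of `a` only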
1948 
1949 // pack boolean
1950 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1951 {
1952  vuint16m2_t tmp = vundefined_u16m2();
1953  tmp = vset_v_u16m1_u16m2(tmp, 0, a.val);
1954  tmp = vset_v_u16m1_u16m2(tmp, 1, b.val);
1955  return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16));
1956 }
1957 
1958 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1959  const v_uint32x4& c, const v_uint32x4& d)
1960 {
1961  vuint32m4_t vabcd = vundefined_u32m4();
1962  vuint16m2_t v16 = vundefined_u16m2();
1963  vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val);
1964  vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val);
1965  vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val);
1966  vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val);
1967  v16 = vnsrl_wx_u16m2(vabcd, 0, 16);
1968  return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
1969 }
1970 
1971 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
1972  const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
1973  const v_uint64x2& g, const v_uint64x2& h)
1974 {
1975  vuint64m8_t v64 = vundefined_u64m8();
1976  vuint32m4_t v32 = vundefined_u32m4();
1977  vuint16m2_t v16 = vundefined_u16m2();
1978  v64 = vset_v_u64m1_u64m8(v64, 0, a.val);
1979  v64 = vset_v_u64m1_u64m8(v64, 1, b.val);
1980  v64 = vset_v_u64m1_u64m8(v64, 2, c.val);
1981  v64 = vset_v_u64m1_u64m8(v64, 3, d.val);
1982  v64 = vset_v_u64m1_u64m8(v64, 4, e.val);
1983  v64 = vset_v_u64m1_u64m8(v64, 5, f.val);
1984  v64 = vset_v_u64m1_u64m8(v64, 6, g.val);
1985  v64 = vset_v_u64m1_u64m8(v64, 7, h.val);
1986  v32 = vnsrl_wx_u32m4(v64, 0, 16);
1987  v16 = vnsrl_wx_u16m2(v32, 0, 16);
1988  return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
1989 }
1990 
1991 //inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \
1992 //{ \
1993 // int16xm2_u tmp; \
1994 // tmp.m1[0] = (vint16m1_t)a.val; \
1995 // tmp.m1[1] = (vint16m1_t)b.val; \
1996 // e8xm1_t mask = (e8xm1_t)vmsge_vx_e16xm2_i16m2(tmp.v, 0, 16);\
1997 // return v_uint8x16(vnclipuvi_mask_u8m1_u16m2(vmv_v_x_u8m1(0, 16), (vuint16m2_t)tmp.v, 0, mask, 16));
1998 //}
1999 
2000 #define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
2001 inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
2002 { \
2003  vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
2004  tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
2005  tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
2006  vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
2007  return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1)); \
2008 } \
2009 inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
2010 { \
2011  vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
2012  tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
2013  vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
2014  return vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2); \
2015 } \
2016 template<int n> inline \
2017 v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
2018 { \
2019  vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
2020  tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
2021  tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
2022  vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
2023  return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1)); \
2024 } \
2025 template<int n> inline \
2026 void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
2027 { \
2028  vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
2029  tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
2030  vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
2031  vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1); \
2032  return vse##tp1##_v_u##tp1##m1(ptr, val, num2);\
2033 }
2034 OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
2035 OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
2036 
2037 
2038 // saturating multiply 8-bit, 16-bit
2039 #define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \
2040  inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
2041  { \
2042  auto res = mul(a.val, b.val, num); \
2043  return _Tpvec(cvt(res, 0, num)); \
2044  } \
2045  inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
2046  { a = a * b; return a; }
2047 
2048 OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 16, vwmul_vv_i16m2, vnclip_wx_i8m1)
2049 OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
2050 OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, 32, vwmul_vv_i32m2, vnclip_wx_i16m1)
2051 OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, 32, vwmulu_vv_u32m2, vnclipu_wx_u16m1)
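// With these overloads, operator* keeps the element width and clips the
// widened product back into it instead of wrapping around, e.g.:
//   v_int16x8 a(30000, 2, 3, 4, 5, 6, 7, 8), b(2, 2, 2, 2, 2, 2, 2, 2);
//   v_int16x8 c = a * b;   // 32767, 4, 6, 8, 10, 12, 14, 16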
2052 
2053 
2054 static const signed char popCountTable[256] =
2055 {
2056  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
2057  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2058  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2059  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2060  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2061  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2062  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2063  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2064  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2065  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2066  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2067  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2068  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2069  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2070  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2071  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
2072 };
2073 
2074 inline vuint8m1_t vcnt_u8(vuint8m1_t val){
2075 #if __riscv_v == 7000
2076  vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16);
2077  return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16);
2078 #else
2079  return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16);
2080 #endif
2081 }
2082 
2083 inline v_uint8x16
2084 v_popcount(const v_uint8x16& a)
2085 {
2086  return v_uint8x16(vcnt_u8(a.val));
2087 }
2088 
2089 inline v_uint8x16
2090 v_popcount(const v_int8x16& a)
2091 {
2092  return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val)));
2093 }
2094 
2095 inline v_uint16x8
2096 v_popcount(const v_uint16x8& a)
2097 {
2098  vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val));
2099  vuint8m1_t seq = vid_v_u8m1(8);
2100  vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
2101  return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));
2102 }
2103 
2104 inline v_uint16x8
2105 v_popcount(const v_int16x8& a)
2106 {
2107  vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val)));
2108  vuint8m1_t seq = vid_v_u8m1(8);
2109  vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
2110  return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));
2111 }
2112 
2113 inline v_uint32x4
2114 v_popcount(const v_uint32x4& a)
2115 {
2116  vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val));
2117  vuint8m1_t seq = vid_v_u8m1(8);
2118  vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
2119  vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
2120  return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));
2121 }
2122 
2123 inline v_uint32x4
2124 v_popcount(const v_int32x4& a)
2125 {
2126  vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val)));
2127  vuint8m1_t seq = vid_v_u8m1(8);
2128  vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
2129  vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
2130  return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));
2131 }
2132 
2133 inline v_uint64x2
2134 v_popcount(const v_uint64x2& a)
2135 {
2136  vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val));
2137  vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
2138  vuint16m1_t res1 = vundefined_u16m1();
2139  vuint16m1_t res2 = vundefined_u16m1();
2140  res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
2141  res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
2142  return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));
2143 }
2144 
2145 inline v_uint64x2
2146 v_popcount(const v_int64x2& a)
2147 {
2148  vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val)));
2149  vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
2150  vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1();
2151  res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
2152  res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
2153  return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));
2154 }
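// v_popcount counts the set bits of every lane independently and returns the
// counts in the matching unsigned vector type:
//   v_uint8x16 v(0x0F, 0xFF, 0x01, 0x00, 0,0,0,0, 0,0,0,0, 0,0,0,0);
//   v_uint8x16 c = v_popcount(v);   // 4, 8, 1, 0, then zeros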
2155 
2156 #define SMASK 1, 2, 4, 8, 16, 32, 64, 128
2157 inline int v_signmask(const v_uint8x16& a)
2158 {
2159  vuint16m1_t res = vundefined_u16m1();
2160  vuint8m1_t id = vid_v_u8m1(16);
2161  vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
2162  vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16);
2163  vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16);
2164  res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
2165  return vmv_x_s_u16m1_u16(res);
2166 }
2167 inline int v_signmask(const v_int8x16& a)
2168 {
2169  vuint16m1_t res = vundefined_u16m1();
2170  vuint8m1_t id = vid_v_u8m1(16);
2171  vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
2172  vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16);
2173  res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
2174  return vmv_x_s_u16m1_u16(res);
2175 }
2176 
2177 inline int v_signmask(const v_int16x8& a)
2178 {
2179  vuint16m1_t res = vundefined_u16m1();
2180  vuint16m1_t id = vid_v_u16m1(8);
2181  vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
2182  vbool16_t mask = vmslt_vx_i16m1_b16(a.val, 0, 8);
2183  res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8);
2184  return vmv_x_s_u16m1_u16(res);
2185 }
2186 inline int v_signmask(const v_uint16x8& a)
2187 {
2188  vuint16m1_t res = vundefined_u16m1();
2189  vuint16m1_t id = vid_v_u16m1(8);
2190  vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
2191  vuint16m1_t t0 = vsrl_vx_u16m1(a.val, 15, 8);
2192  vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8);
2193  res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8);
2194  return vmv_x_s_u16m1_u16(res);
2195 }
2196 inline int v_signmask(const v_int32x4& a)
2197 {
2198  vuint32m1_t res = vundefined_u32m1();
2199  vuint32m1_t id = vid_v_u32m1(4);
2200  vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
2201  vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4);
2202  res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
2203  return vmv_x_s_u32m1_u32(res);
2204 }
2205 inline int v_signmask(const v_uint32x4& a)
2206 {
2207  vuint32m1_t res = vundefined_u32m1();
2208  vuint32m1_t id = vid_v_u32m1(4);
2209  vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
2210  vuint32m1_t t0 = vsrl_vx_u32m1(a.val, 31, 4);
2211  vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4);
2212  res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
2213  return vmv_x_s_u32m1_u32(res);
2214 }
2215 inline int v_signmask(const v_uint64x2& a)
2216 {
2217  vuint64m1_t res = vundefined_u64m1();
2218  vuint64m1_t id = vid_v_u64m1(2);
2219  vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2);
2220  vuint64m1_t t0 = vsrl_vx_u64m1(a.val, 63, 2);
2221  vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2);
2222  res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2);
2223  return vmv_x_s_u64m1_u64(res);
2224 }
2225 inline int v_signmask(const v_int64x2& a)
2226 { return v_signmask(v_reinterpret_as_u64(a)); }
2227 inline int v_signmask(const v_float64x2& a)
2228 { return v_signmask(v_reinterpret_as_u64(a)); }
2229 inline int v_signmask(const v_float32x4& a)
2230 {
2231  return v_signmask(v_reinterpret_as_u32(a));
2232  /*
2233  vuint32m1_t res;
2234  vuint32m1_t id = vid_v_u32m1(4);
2235  vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
2236  vbool32_t mask = vmflt_vf_f32m1_b32(a.val, 0, 4);
2237  res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
2238  return vmv_x_s_u32m1_u32(res);*/
2239 }
2240 
2241 inline int v_scan_forward(const v_int8x16& a) {
2242 int val = v_signmask(a);
2243 if(val==0) return 0;
2244 else return trailingZeros32(val); }
2245 inline int v_scan_forward(const v_uint8x16& a) {
2246 int val = v_signmask(a);
2247 if(val==0) return 0;
2248 else return trailingZeros32(val); }
2249 inline int v_scan_forward(const v_int16x8& a) {
2250 int val = v_signmask(a);
2251 if(val==0) return 0;
2252 else return trailingZeros32(val); }
2253 inline int v_scan_forward(const v_uint16x8& a) {
2254 int val = v_signmask(a);
2255 if(val==0) return 0;
2256 else return trailingZeros32(val); }
2257 inline int v_scan_forward(const v_int32x4& a) {
2258 int val = v_signmask(a);
2259 if(val==0) return 0;
2260 else return trailingZeros32(val); }
2261 inline int v_scan_forward(const v_uint32x4& a) {
2262 int val = v_signmask(a);
2263 if(val==0) return 0;
2264 else return trailingZeros32(val); }
2265 inline int v_scan_forward(const v_float32x4& a) {
2266 int val = v_signmask(a);
2267 if(val==0) return 0;
2268 else return trailingZeros32(val); }
2269 inline int v_scan_forward(const v_int64x2& a) {
2270 int val = v_signmask(a);
2271 if(val==0) return 0;
2272 else return trailingZeros32(val); }
2273 inline int v_scan_forward(const v_uint64x2& a) {
2274 int val = v_signmask(a);
2275 if(val==0) return 0;
2276 else return trailingZeros32(val); }
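// v_signmask packs the sign (most significant) bit of every lane into an
// integer bitmask; v_scan_forward returns the index of the first lane whose
// sign bit is set (0 when none is, as implemented above). For example:
//   v_int32x4 v(-1, 2, -3, 4);
//   int m = v_signmask(v);      // bits 0 and 2 set -> 5
//   int f = v_scan_forward(v);  // 0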
2277 
2278 #define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \
2279 inline bool v_check_all(const v_##_Tpvec& a) \
2280 { \
2281  suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
2282  return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; \
2283 } \
2284 inline bool v_check_any(const v_##_Tpvec& a) \
2285 { \
2286  suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
2287  return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \
2288 }
2289 
2290 OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16, b8)
2291 OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16)
2292 OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32)
2293 OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64)
2294 
2295 inline bool v_check_all(const v_int8x16& a)
2296 { return v_check_all(v_reinterpret_as_u8(a)); }
2297 inline bool v_check_all(const v_int16x8& a)
2298 { return v_check_all(v_reinterpret_as_u16(a)); }
2299 inline bool v_check_all(const v_int32x4& a)
2300 { return v_check_all(v_reinterpret_as_u32(a)); }
2301 inline bool v_check_all(const v_float32x4& a)
2302 { return v_check_all(v_reinterpret_as_u32(a)); }
2303 inline bool v_check_all(const v_int64x2& a)
2304 { return v_check_all(v_reinterpret_as_u64(a)); }
2305 inline bool v_check_all(const v_float64x2& a)
2306 { return v_check_all(v_reinterpret_as_u64(a)); }
2307 
2308 inline bool v_check_any(const v_int8x16& a)
2309 { return v_check_any(v_reinterpret_as_u8(a)); }
2310 inline bool v_check_any(const v_int16x8& a)
2311 { return v_check_any(v_reinterpret_as_u16(a)); }
2312 inline bool v_check_any(const v_int32x4& a)
2313 { return v_check_any(v_reinterpret_as_u32(a)); }
2314 inline bool v_check_any(const v_float32x4& a)
2315 { return v_check_any(v_reinterpret_as_u32(a)); }
2316 inline bool v_check_any(const v_int64x2& a)
2317 { return v_check_any(v_reinterpret_as_u64(a)); }
2318 inline bool v_check_any(const v_float64x2& a)
2319 { return v_check_any(v_reinterpret_as_u64(a)); }
2320 
2321 #define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \
2322 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
2323 { \
2324  return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \
2325 }
2326 
2327 OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8)
2328 OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16)
2329 OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32)
2330 OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8)
2331 OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16)
2332 OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32)
2333 inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
2334 {
2335  return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4));
2336 }
2337 inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
2338 {
2339  return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2));
2340 }
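// v_select picks lanes from `a` where the mask lane is non-zero and from `b`
// elsewhere, which pairs naturally with the comparison operators:
//   v_int32x4 a(1, 2, 3, 4), b(10, 20, 30, 40);
//   v_int32x4 mask = (a > v_int32x4(2, 2, 2, 2));   // 0, 0, -1, -1
//   v_int32x4 r = v_select(mask, a, b);             // 10, 20, 3, 4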
2341 
2342 #define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \
2343 inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
2344 { \
2345  _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \
2346  b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0); \
2347  b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1); \
2348 } \
2349 inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
2350 { \
2351  _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num2); \
2352  return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
2353 } \
2354 inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
2355 { \
2356  _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \
2357  return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \
2358 } \
2359 inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
2360 { \
2361  _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2); \
2362  _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2); \
2363  return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
2364 }
2365 
2366 OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8)
2367 OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16)
2368 OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32)
2369 OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1, 8)
2370 OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1, 16)
2371 OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1, 32)
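// The expand helpers widen each lane to the next element size (zero-extending
// unsigned types, sign-extending signed ones):
//   uchar buf[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
//   v_uint16x8 w = v_load_expand(buf);   // first 8 bytes widened to ushort
//   v_uint16x8 lo, hi;
//   v_expand(v_load(buf), lo, hi);       // lo == lanes 0..7, hi == lanes 8..15, widened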
2372 
2373 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
2374 {
2375  vuint16m2_t b = vundefined_u16m2();
2376  vuint32m2_t c = vundefined_u32m2();
2377  vuint8m1_t val = vle8_v_u8m1(ptr, 4);
2378  b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4);
2379  c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);
2380  return v_uint32x4(vget_v_u32m2_u32m1(c, 0));
2381 }
2382 
2383 inline v_int32x4 v_load_expand_q(const schar* ptr)
2384 {
2385  vint16m2_t b = vundefined_i16m2();
2386  vint32m2_t c = vundefined_i32m2();
2387  vint8m1_t val = vle8_v_i8m1(ptr, 4);
2388  b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4);
2389  c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);
2390  return v_int32x4(vget_v_i32m2_i32m1(c, 0));
2391 }
2392 #define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E}
2393 #define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007}
2394 #define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007}
2395 #define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0}
2396 
2397 #define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh, refunc) \
2398 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
2399 { \
2400  v##_Tp##m2_t tmp = vundefined_##_T##m2();\
2401  tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \
2402  tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \
2403  unsigned mdata[] = VITL_##num; \
2404  vuint32m2_t mask = vle32_v_u32m2(mdata, 8); \
2405  tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2); \
2406  b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \
2407  b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \
2408 } \
2409 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2410 { \
2411  v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
2412  return v_##_Tpvec(b0);\
2413 } \
2414 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2415 { \
2416  v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
2417  v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
2418  v##_Tp##m1_t b1 = vundefined_##_T##m1(); \
2419  b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \
2420  a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \
2421  b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
2422  return v_##_Tpvec(b1);\
2423 } \
2424 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
2425 { \
2426  v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
2427  v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
2428  c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
2429  b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \
2430  a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \
2431  d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
2432 }
2433 
2434 OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
2435 OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
2436 OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
2437 OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
2438 OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,)
2439 OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,)
2440 OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,)
2441 OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, vreinterpret_v_u32m2_u64m2)
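// Illustrative usage sketch (added for documentation, not part of the original header):
// the unpack helpers above interleave lanes or recombine vector halves.
//   v_int16x8 lo, hi;
//   v_zip(a, b, lo, hi);                 // lo = {a0,b0,a1,b1,...}, hi = remaining pairs
//   v_int16x8 l = v_combine_low(a, b);   // {a0..a3, b0..b3}
//   v_int16x8 h = v_combine_high(a, b);  // {a4..a7, b4..b7}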
2442 
2443 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
2444 {
2445  return v_uint8x16(vrgather_vv_u8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));
2446 }
2447 inline v_int8x16 v_reverse(const v_int8x16 &a)
2448 {
2449  return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));
2450 }
2451 
2452 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
2453 {
2454  return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));
2455 }
2456 
2457 inline v_int16x8 v_reverse(const v_int16x8 &a)
2458 {
2459  return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));
2460 }
2461 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
2462 {
2463  return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));
2464 }
2465 
2466 inline v_int32x4 v_reverse(const v_int32x4 &a)
2467 {
2468  return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));
2469 }
2470 
2471 inline v_float32x4 v_reverse(const v_float32x4 &a)
2472 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
2473 
2474 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
2475 {
2476  return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
2477 }
2478 
2479 inline v_int64x2 v_reverse(const v_int64x2 &a)
2480 {
2481  return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
2482 }
2483 
2484 inline v_float64x2 v_reverse(const v_float64x2 &a)
2485 {
2486  return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
2487 }
2488 
2489 #define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
2490 template <int n> \
2491 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2492 { return v_rotate_right<n>(a, b);}
2493 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
2494 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
2495 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
2496 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
2497 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
2498 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
2499 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
2500 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
2501 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
2502 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
2503 
2504 
2505 #define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) \
2506 template<int i> inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); }
2507 
2508 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8)
2509 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, i8m1, 16, vmv_x_s_i8m1_i8)
2510 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16)
2511 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16)
2512 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32)
2513 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32)
2514 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64)
2515 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64)
2516 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, vfmv_f_s_f32m1_f32)
2517 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64)
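// Illustrative usage sketch (added for documentation, not part of the original header):
// v_extract_n returns one lane as a scalar, with the lane index as a template argument.
//   v_int32x4 v(10, 20, 30, 40);
//   int third = v_extract_n<2>(v);   // 30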
2518 
2519 #define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
2520 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }
2521 
2522 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
2523 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
2524 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
2525 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
2526 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
2527 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
2528 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
2529 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
2530 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
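// Illustrative usage sketch (added for documentation, not part of the original header):
//   v_float32x4 v(1.f, 2.f, 3.f, 4.f);
//   v_float32x4 all2 = v_broadcast_element<1>(v);   // {2.f, 2.f, 2.f, 2.f}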
2531 
2532 inline void __builtin_riscv_fsrm(int val)
2533 {
2534  asm("csrw frm, %0\n\t"
2535  :
2536  :"r"(val));
2537  return;
2538 }
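// Note (added for clarity, not in the original source): the helper above writes the
// RISC-V dynamic rounding-mode CSR (frm). The values used by the rounding functions
// below follow the standard encoding: 0 = RNE (round to nearest, ties to even),
// 1 = RTZ (toward zero), 2 = RDN (toward -inf), 3 = RUP (toward +inf).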
2539 
2540 inline void barrier1(void *arg) {
2541  __asm__ __volatile__("" : : "r" (arg) : "memory");
2542 }
2543 
2544 inline v_int32x4 v_round(const v_float32x4& a)
2545 {
2546  __builtin_riscv_fsrm(0);
2547  vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
2548  barrier1(&nan);
2549  vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2550  vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2551  __builtin_riscv_fsrm(0);
2552  return v_int32x4(val);
2553 }
2554 inline v_int32x4 v_floor(const v_float32x4& a)
2555 {
2556  __builtin_riscv_fsrm(2);
2557  vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
2558  barrier1(&nan);
2559  vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2560  vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2561  __builtin_riscv_fsrm(0);
2562  return v_int32x4(val);
2563 }
2564 
2565 inline v_int32x4 v_ceil(const v_float32x4& a)
2566 {
2567  __builtin_riscv_fsrm(3);
2568  vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
2569  barrier1(&nan);
2570  vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2571  vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2572  __builtin_riscv_fsrm(0);
2573  return v_int32x4(val);
2574 }
2575 
2576 inline v_int32x4 v_trunc(const v_float32x4& a)
2577 {
2578  __builtin_riscv_fsrm(1);
2579  vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
2580  barrier1(&nan);
2581  vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2582  vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2583  __builtin_riscv_fsrm(0);
2584  return v_int32x4(val);
2585 }
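// Illustrative usage sketch (added for documentation, not part of the original header):
//   v_float32x4 x(1.5f, -1.5f, 2.7f, -2.7f);
//   v_int32x4 r = v_round(x);   // { 2, -2, 3, -3 }  (ties to even)
//   v_int32x4 f = v_floor(x);   // { 1, -2, 2, -3 }
//   v_int32x4 c = v_ceil(x);    // { 2, -1, 3, -2 }
//   v_int32x4 t = v_trunc(x);   // { 1, -1, 2, -2 }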
2586 
2587 inline v_int32x4 v_round(const v_float64x2& a)
2588 {
2589  __builtin_riscv_fsrm(0);
2590  vfloat64m2_t _val = vundefined_f64m2();
2591  _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2592  //_val = vset_f64m2(_val, 1, a.val);
2593  _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
2594  barrier1(&_val);
2595  vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
2596  __builtin_riscv_fsrm(0);
2597  return v_int32x4(val);
2598 }
2599 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2600 {
2601  __builtin_riscv_fsrm(0);
2602  vfloat64m2_t _val = vundefined_f64m2();
2603  _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2604  _val = vset_v_f64m1_f64m2(_val, 1, b.val);
2605  barrier1(&_val);
2606  vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
2607  __builtin_riscv_fsrm(0);
2608  return v_int32x4(val);
2609 }
2610 inline v_int32x4 v_floor(const v_float64x2& a)
2611 {
2612  __builtin_riscv_fsrm(2);
2613  vfloat64m2_t _val = vundefined_f64m2();
2614  _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2615  vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
2616  vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
2617  barrier1(&nan);
2618  vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2619  vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2620  __builtin_riscv_fsrm(0);
2621  return v_int32x4(val);
2622 }
2623 
2624 inline v_int32x4 v_ceil(const v_float64x2& a)
2625 {
2626  __builtin_riscv_fsrm(3);
2627  vfloat64m2_t _val = vundefined_f64m2();
2628  _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2629  vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
2630  vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
2631  barrier1(&nan);
2632  vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2633  vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2634  __builtin_riscv_fsrm(0);
2635  return v_int32x4(val);
2636 }
2637 
2638 inline v_int32x4 v_trunc(const v_float64x2& a)
2639 {
2640  __builtin_riscv_fsrm(1);
2641  vfloat64m2_t _val = vundefined_f64m2();
2642  _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2643  vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
2644  vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
2645  barrier1(&nan);
2646  vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2647  vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2648  __builtin_riscv_fsrm(0);
2649  return v_int32x4(val);
2650 }
2651 
2652 #define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
2653 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
2654 { \
2655  intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \
2656 } \
2657 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
2658 { \
2659  intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num); \
2660 }\
2661 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
2662  v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
2663 { \
2664  intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \
2665 }
2666 
2667 #define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \
2668 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2669  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2670 { \
2671  intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \
2672 } \
2673 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2674  const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2675 { \
2676  intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \
2677 } \
2678 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2679  const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
2680  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2681 { \
2682  intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \
2683 }
2684 
2685 #define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) \
2686 OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize) \
2687 OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize)
2688 
2689 //OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, )
2690 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8)
2691 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16)
2692 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32)
2693 
2694 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8)
2695 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16)
2696 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32, 32)
2697 
2698 #define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) \
2699 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
2700 { vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \
2701 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
2702 { vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);}\
2703 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
2704  v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
2705 { vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \
2706 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2707  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2708 { vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \
2709 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2710  const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2711 { vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \
2712 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2713  const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
2714  hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2715 { vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);}
2716 
2717 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32)
2718 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64)
2719 
2720 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64)
2721 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64)
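// Illustrative usage sketch (added for documentation, not part of the original header):
// de-interleaving a packed BGR uchar buffer into three planes and storing it back.
//   v_uint8x16 b, g, r;
//   v_load_deinterleave(bgr_ptr, b, g, r);   // reads 48 interleaved bytes
//   v_store_interleave(bgr_ptr, b, g, r);    // writes them back interleaved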
2722 
2723 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2724 {
2725  return v_float32x4(vfcvt_f_x_v_f32m1(a.val, 4));
2726 }
2727 
2728 #if CV_SIMD128_64F
2729 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2730 {
2731  vfloat64m2_t _val = vundefined_f64m2();
2732  _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2733  vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
2734  return v_float32x4(aval);
2735 }
2736 
2737 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2738 {
2739  vfloat64m2_t _val = vundefined_f64m2();
2740  _val = vset_v_f64m1_f64m2(_val, 0, a.val);
2741  _val = vset_v_f64m1_f64m2(_val, 1, b.val);
2742  vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4);
2743  return v_float32x4(aval);
2744 }
2745 
2746 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2747 {
2748  vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
2749  vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
2750  return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
2751 }
2752 
2753 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2754 {
2755  vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
2756  vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
2757  return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
2758 }
2759 
2760 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2761 {
2762  vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
2763  return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
2764 }
2765 
2766 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2767 {
2768  vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
2769  return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
2770 }
2771 
2772 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2773 {
2774  return v_float64x2(vfcvt_f_x_v_f64m1(a.val, 2));
2775 }
2776 
2777 #endif
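// Illustrative usage sketch (added for documentation, not part of the original header):
//   v_int32x4   i(1, 2, 3, 4);
//   v_float32x4 f  = v_cvt_f32(i);        // {1.f, 2.f, 3.f, 4.f}
//   v_float64x2 lo = v_cvt_f64(i);        // {1.0, 2.0}  (lower half)
//   v_float64x2 hi = v_cvt_f64_high(i);   // {3.0, 4.0}  (upper half)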
2778 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2779 {
2780  uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08};
2781  vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2782  return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
2783 }
2784 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
2785 {
2786  return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec)));
2787 }
2788 
2789 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2790 {
2791  uint64 mdata[2] = {0x0703060205010400, 0x0F0B0E0A0D090C08};
2792  vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2793  return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
2794 }
2795 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
2796 {
2797  return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec)));
2798 }
2799 
2800 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2801 {
2802  uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
2803  vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2804  return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
2805 }
2806 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
2807 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2808 {
2809  uint64 mdata[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504};
2810  vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2811  return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
2812 }
2813 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2814 
2815 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2816 {
2817  uint64 mdata[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504};
2818  vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2819  return v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
2820 }
2821 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2822 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
2823 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2824 {
2825  uint64 mdata[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
2826  vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2827  return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(m0), 16)));
2828 }
2829 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2830 
2831 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2832 {
2833  uint64 mdata[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
2834  vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
2835  return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
2836 }
2837 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2838 
2839 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2840 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2841 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2842 
2843 #if CV_SIMD128_64F
2844 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
2845 { return v_cvt_f64(v_dotprod(a, b)); }
2846 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
2847  const v_float64x2& c)
2848 { return v_dotprod_expand(a, b) + c; }
2849 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
2850 {
2851  vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
2852  vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), 2);
2853  return v_float64x2(res);
2854 }
2855 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
2856 { v_float64x2 res = v_dotprod_expand_fast(a, b);
2857  return res + c; }
2858 #endif
2860 #if __riscv_v == 7000
2861 inline v_float32x4 v_load_expand(const hfloat* ptr)
2862 {
2863  vfloat16m1_t v = vle16_v_f16m1((__fp16*)ptr, 4);
2864  vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);
2865  return v_float32x4(vget_v_f32m2_f32m1(v32, 0));
2866 }
2867 
2868 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2869 {
2870  vfloat32m2_t v32 = vundefined_f32m2();
2871  v32 = vset_v_f32m1_f32m2(v32, 0, v.val);
2872  vfloat16m1_t hv = vfncvt_f_f_w_f16m1(v32, 4);
2873  vse16_v_f16m1((__fp16*)ptr, hv, 4);
2874 }
2875 #else
2876 inline v_float32x4 v_load_expand(const hfloat* ptr)
2877 {
2878  vfloat16mf2_t v = vle16_v_f16mf2((__fp16*)ptr, 4);
2879  vfloat32m1_t v32 = vfwcvt_f_f_v_f32m1(v, 4);
2880  return v_float32x4(v32);
2881 }
2882 
2883 inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2884 {
2885  //vfloat32m2_t v32 = vundefined_f32m2();
2886  //v32 = vset_f32m2(v32, 0, v.val);
2887  vfloat16mf2_t hv = vfncvt_f_f_w_f16mf2(v.val, 4);
2888  vse16_v_f16mf2((__fp16*)ptr, hv, 4);
2889 }
2890 #endif
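// Illustrative usage sketch (added for documentation, not part of the original header):
// round-tripping four half-precision values through float32.
//   v_float32x4 f = v_load_expand(fp16_ptr);   // 4 x hfloat -> 4 x float
//   v_pack_store(fp16_ptr, f);                 // 4 x float  -> 4 x hfloat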
2891 
2892 inline void v_cleanup() {}
2893 
2894 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2895 
2897 
2898 }
2899 #endif