EstervQrCode 2.0.0
Library for QR code manipulation
intrin_rvv.hpp
1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html.
4
5// The original implementation was contributed by Yin Zhang.
6// Copyright (C) 2020, Institute of Software, Chinese Academy of Sciences.
7
8#ifndef OPENCV_HAL_INTRIN_RVV_HPP
9#define OPENCV_HAL_INTRIN_RVV_HPP
10
11#include <algorithm>
12
13// RVV intrinsics have been renamed in version 0.11, so we need to include
14// compatibility headers:
15// https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/auto-generated/rvv-v0p10-compatible-headers
16#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>10999
17#include "intrin_rvv_010_compat_non-policy.hpp"
18#include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
19#endif
20
21
22// Building for T-Head C906 core with RVV 0.7.1 using toolchain
23// https://github.com/T-head-Semi/xuantie-gnu-toolchain
24// with option '-march=rv64gcv0p7'
25#ifdef __THEAD_VERSION__
26# if __riscv_v == 7000
27# include <fenv.h>
28# define CV_RVV_THEAD_0_7
29# endif
30#endif
31
32namespace cv
33{
34
36
37CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
38
39#define CV_SIMD128 1
40#ifndef CV_RVV_THEAD_0_7
41# define CV_SIMD128_64F 1
42#else
43# define CV_SIMD128_64F 0
44#endif
45
47// The following types have been defined in clang, but not in GCC yet.
48#ifndef __clang__
49
50struct vuint8mf2_t
51{
52 uchar val[8] = {0};
53 vuint8mf2_t() {}
54 vuint8mf2_t(const uchar* ptr)
55 {
56 for (int i = 0; i < 8; ++i)
57 {
58 val[i] = ptr[i];
59 }
60 }
61};
62struct vint8mf2_t
63{
64 schar val[8] = {0};
65 vint8mf2_t() {}
66 vint8mf2_t(const schar* ptr)
67 {
68 for (int i = 0; i < 8; ++i)
69 {
70 val[i] = ptr[i];
71 }
72 }
73};
74struct vuint16mf2_t
75{
76 ushort val[4] = {0};
77 vuint16mf2_t() {}
78 vuint16mf2_t(const ushort* ptr)
79 {
80 for (int i = 0; i < 4; ++i)
81 {
82 val[i] = ptr[i];
83 }
84 }
85};
86struct vint16mf2_t
87{
88 short val[4] = {0};
89 vint16mf2_t() {}
90 vint16mf2_t(const short* ptr)
91 {
92 for (int i = 0; i < 4; ++i)
93 {
94 val[i] = ptr[i];
95 }
96 }
97};
98struct vuint32mf2_t
99{
100 unsigned val[2] = {0};
101 vuint32mf2_t() {}
102 vuint32mf2_t(const unsigned* ptr)
103 {
104 val[0] = ptr[0];
105 val[1] = ptr[1];
106 }
107};
108struct vint32mf2_t
109{
110 int val[2] = {0};
111 vint32mf2_t() {}
112 vint32mf2_t(const int* ptr)
113 {
114 val[0] = ptr[0];
115 val[1] = ptr[1];
116 }
117};
118struct vfloat32mf2_t
119{
120 float val[2] = {0};
121 vfloat32mf2_t() {}
122 vfloat32mf2_t(const float* ptr)
123 {
124 val[0] = ptr[0];
125 val[1] = ptr[1];
126 }
127};
128struct vuint64mf2_t
129{
130 uint64 val[1] = {0};
131 vuint64mf2_t() {}
132 vuint64mf2_t(const uint64* ptr)
133 {
134 val[0] = ptr[0];
135 }
136};
137struct vint64mf2_t
138{
139 int64 val[1] = {0};
140 vint64mf2_t() {}
141 vint64mf2_t(const int64* ptr)
142 {
143 val[0] = ptr[0];
144 }
145};
146struct vfloat64mf2_t
147{
148 double val[1] = {0};
149 vfloat64mf2_t() {}
150 vfloat64mf2_t(const double* ptr)
151 {
152 val[0] = ptr[0];
153 }
154};
155struct vuint8mf4_t
156{
157 uchar val[4] = {0};
158 vuint8mf4_t() {}
159 vuint8mf4_t(const uchar* ptr)
160 {
161 for (int i = 0; i < 4; ++i)
162 {
163 val[i] = ptr[i];
164 }
165 }
166};
167struct vint8mf4_t
168{
169 schar val[4] = {0};
170 vint8mf4_t() {}
171 vint8mf4_t(const schar* ptr)
172 {
173 for (int i = 0; i < 4; ++i)
174 {
175 val[i] = ptr[i];
176 }
177 }
178};
179
180#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
181inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr, size_t vl) \
182{ \
183 CV_UNUSED(vl); \
184 return _Tpvec(ptr); \
185} \
186inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v, size_t vl) \
187{ \
188 CV_UNUSED(vl); \
189 for (int i = 0; i < n; ++i) \
190 { \
191 ptr[i] = v.val[i]; \
192 } \
193}
194
195OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8)
196OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8)
197OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4)
198OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4)
199OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2)
200OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2)
201OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2)
202OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1)
203OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1)
204OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)
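// A minimal round-trip sketch of the emulated mf2 load/store generated above
// (GCC path only). The function name and array contents are illustrative; the
// emulated ops simply copy the fixed lane count and ignore 'vl'.
inline void rvv_mf2_roundtrip_sketch()
{
    ushort src[4] = {1, 2, 3, 4};
    ushort dst[4] = {0};
    vuint16mf2_t v = vle16_v_u16mf2(src, 4);   // emulated load of 4 lanes
    vse16_v_u16mf2(dst, v, 4);                 // emulated store of 4 lanes
}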
205
206
207#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
208inline _Tpwvec wcvt (_Tpvec v, size_t vl) \
209{ \
210 _wTp tmp[n]; \
211 for (int i = 0; i < n; ++i) \
212 { \
213 tmp[i] = (_wTp)v.val[i]; \
214 } \
215 return vle##width##_v_##suffix##m1(tmp, vl); \
216}
217
218OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)
219OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8)
220OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4)
221OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4)
222OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)
223OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)
224
225inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base, size_t vl)
226{
227 CV_UNUSED(vl);
228 return vuint8mf4_t(base);
229}
230inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base, size_t vl)
231{
232 CV_UNUSED(vl);
233 return vint8mf4_t(base);
234}
235
236inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src, size_t vl)
237{
238 ushort tmp[4];
239 for (int i = 0; i < 4; ++i)
240 {
241 tmp[i] = (ushort)src.val[i];
242 }
243 return vle16_v_u16mf2(tmp, vl);
244}
245inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src, size_t vl)
246{
247 short tmp[4];
248 for (int i = 0; i < 4; ++i)
249 {
250 tmp[i] = (short)src.val[i];
251 }
252 return vle16_v_i16mf2(tmp, vl);
253}
254#endif
255
257
258#ifndef __clang__
259struct v_uint8x16
260{
261 typedef uchar lane_type;
262 enum { nlanes = 16 };
263
264 v_uint8x16() {}
265 explicit v_uint8x16(vuint8m1_t v)
266 {
267 vse8_v_u8m1(val, v, nlanes);
268 }
269 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
270 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
271 {
272 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
273 for (int i = 0; i < nlanes; ++i)
274 {
275 val[i] = v[i];
276 }
277 }
278 operator vuint8m1_t() const
279 {
280 return vle8_v_u8m1(val, nlanes);
281 }
282 uchar get0() const
283 {
284 return val[0];
285 }
286
287 uchar val[16];
288};
289
290struct v_int8x16
291{
292 typedef schar lane_type;
293 enum { nlanes = 16 };
294
295 v_int8x16() {}
296 explicit v_int8x16(vint8m1_t v)
297 {
298 vse8_v_i8m1(val, v, nlanes);
299 }
300 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
301 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
302 {
303 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
304 for (int i = 0; i < nlanes; ++i)
305 {
306 val[i] = v[i];
307 }
308 }
309 operator vint8m1_t() const
310 {
311 return vle8_v_i8m1(val, nlanes);
312 }
313 schar get0() const
314 {
315 return val[0];
316 }
317
318 schar val[16];
319};
320
321struct v_uint16x8
322{
323 typedef ushort lane_type;
324 enum { nlanes = 8 };
325
326 v_uint16x8() {}
327 explicit v_uint16x8(vuint16m1_t v)
328 {
329 vse16_v_u16m1(val, v, nlanes);
330 }
331 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
332 {
333 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
334 for (int i = 0; i < nlanes; ++i)
335 {
336 val[i] = v[i];
337 }
338 }
339 operator vuint16m1_t() const
340 {
341 return vle16_v_u16m1(val, nlanes);
342 }
343 ushort get0() const
344 {
345 return val[0];
346 }
347
348 ushort val[8];
349};
350
351struct v_int16x8
352{
353 typedef short lane_type;
354 enum { nlanes = 8 };
355
356 v_int16x8() {}
357 explicit v_int16x8(vint16m1_t v)
358 {
359 vse16_v_i16m1(val, v, nlanes);
360 }
361 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
362 {
363 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
364 for (int i = 0; i < nlanes; ++i)
365 {
366 val[i] = v[i];
367 }
368 }
369 operator vint16m1_t() const
370 {
371 return vle16_v_i16m1(val, nlanes);
372 }
373 short get0() const
374 {
375 return val[0];
376 }
377
378 short val[8];
379};
380
381struct v_uint32x4
382{
383 typedef unsigned lane_type;
384 enum { nlanes = 4 };
385
386 v_uint32x4() {}
387 explicit v_uint32x4(vuint32m1_t v)
388 {
389 vse32_v_u32m1(val, v, nlanes);
390 }
391 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
392 {
393 unsigned v[] = {v0, v1, v2, v3};
394 for (int i = 0; i < nlanes; ++i)
395 {
396 val[i] = v[i];
397 }
398 }
399 operator vuint32m1_t() const
400 {
401 return vle32_v_u32m1(val, nlanes);
402 }
403 unsigned get0() const
404 {
405 return val[0];
406 }
407
408 unsigned val[4];
409};
410
411struct v_int32x4
412{
413 typedef int lane_type;
414 enum { nlanes = 4 };
415
416 v_int32x4() {}
417 explicit v_int32x4(vint32m1_t v)
418 {
419 vse32_v_i32m1(val, v, nlanes);
420 }
421 v_int32x4(int v0, int v1, int v2, int v3)
422 {
423 int v[] = {v0, v1, v2, v3};
424 for (int i = 0; i < nlanes; ++i)
425 {
426 val[i] = v[i];
427 }
428 }
429 operator vint32m1_t() const
430 {
431 return vle32_v_i32m1(val, nlanes);
432 }
433 int get0() const
434 {
435 return val[0];
436 }
437 int val[4];
438};
439
440struct v_float32x4
441{
442 typedef float lane_type;
443 enum { nlanes = 4 };
444
445 v_float32x4() {}
446 explicit v_float32x4(vfloat32m1_t v)
447 {
448 vse32_v_f32m1(val, v, nlanes);
449 }
450 v_float32x4(float v0, float v1, float v2, float v3)
451 {
452 float v[] = {v0, v1, v2, v3};
453 for (int i = 0; i < nlanes; ++i)
454 {
455 val[i] = v[i];
456 }
457 }
458 operator vfloat32m1_t() const
459 {
460 return vle32_v_f32m1(val, nlanes);
461 }
462 float get0() const
463 {
464 return val[0];
465 }
466 float val[4];
467};
468
469struct v_uint64x2
470{
471 typedef uint64 lane_type;
472 enum { nlanes = 2 };
473
474 v_uint64x2() {}
475 explicit v_uint64x2(vuint64m1_t v)
476 {
477 vse64_v_u64m1(val, v, nlanes);
478 }
479 v_uint64x2(uint64 v0, uint64 v1)
480 {
481 uint64 v[] = {v0, v1};
482 for (int i = 0; i < nlanes; ++i)
483 {
484 val[i] = v[i];
485 }
486 }
487 operator vuint64m1_t() const
488 {
489 return vle64_v_u64m1(val, nlanes);
490 }
491 uint64 get0() const
492 {
493 return val[0];
494 }
495
496 uint64 val[2];
497};
498
499struct v_int64x2
500{
501 typedef int64 lane_type;
502 enum { nlanes = 2 };
503
504 v_int64x2() {}
505 explicit v_int64x2(vint64m1_t v)
506 {
507 vse64_v_i64m1(val, v, nlanes);
508 }
509 v_int64x2(int64 v0, int64 v1)
510 {
511 int64 v[] = {v0, v1};
512 for (int i = 0; i < nlanes; ++i)
513 {
514 val[i] = v[i];
515 }
516 }
517 operator vint64m1_t() const
518 {
519 return vle64_v_i64m1(val, nlanes);
520 }
521 int64 get0() const
522 {
523 return val[0];
524 }
525
526 int64 val[2];
527};
528
529#if CV_SIMD128_64F
530struct v_float64x2
531{
532 typedef double lane_type;
533 enum { nlanes = 2 };
534
535 v_float64x2() {}
536 explicit v_float64x2(vfloat64m1_t v)
537 {
538 vse64_v_f64m1(val, v, nlanes);
539 }
540 v_float64x2(double v0, double v1)
541 {
542 double v[] = {v0, v1};
543 for (int i = 0; i < nlanes; ++i)
544 {
545 val[i] = v[i];
546 }
547 }
548 operator vfloat64m1_t() const
549 {
550 return vle64_v_f64m1(val, nlanes);
551 }
552 double get0() const
553 {
554 return val[0];
555 }
556
557 double val[2];
558};
559#endif
560#else
561struct v_uint8x16
562{
563 typedef uchar lane_type;
564 enum { nlanes = 16 };
565
566 v_uint8x16() {}
567 explicit v_uint8x16(vuint8m1_t v)
568 {
569 *pval = v;
570 }
571 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
572 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
573 {
574 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
575 *pval = vle8_v_u8m1(v, nlanes);
576 }
577 operator vuint8m1_t() const
578 {
579 return *pval;
580 }
581 uchar get0() const
582 {
583 return vmv_x(*pval);
584 }
585 inline v_uint8x16& operator=(const v_uint8x16& vec) {
586 *pval = *(vec.pval);
587 return *this;
588 }
589 inline v_uint8x16(const v_uint8x16& vec) {
590 *pval = *(vec.pval);
591 }
592 uchar val[16];
593 vuint8m1_t* pval = (vuint8m1_t*)val;
594};
595
596struct v_int8x16
597{
598 typedef schar lane_type;
599 enum { nlanes = 16 };
600
601 v_int8x16() {}
602 explicit v_int8x16(vint8m1_t v)
603 {
604 *pval = v;
605 }
606 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
607 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
608 {
609 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
610 *pval = vle8_v_i8m1(v, nlanes);
611 }
612 operator vint8m1_t() const
613 {
614 return *pval;
615 }
616 schar get0() const
617 {
618 return vmv_x(*pval);
619 }
620 inline v_int8x16& operator=(const v_int8x16& vec) {
621 *pval = *(vec.pval);
622 return *this;
623 }
624 inline v_int8x16(const v_int8x16& vec) {
625 *pval = *(vec.pval);
626 }
627 schar val[16];
628 vint8m1_t* pval = (vint8m1_t*)val;
629};
630
631struct v_uint16x8
632{
633 typedef ushort lane_type;
634 enum { nlanes = 8 };
635
636 v_uint16x8() {}
637 explicit v_uint16x8(vuint16m1_t v)
638 {
639 *pval = v;
640 }
641 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
642 {
643 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
644 *pval = vle16_v_u16m1(v, nlanes);
645 }
646 operator vuint16m1_t() const
647 {
648 return *pval;
649 }
650 ushort get0() const
651 {
652 return vmv_x(*pval);
653 }
654
655 inline v_uint16x8& operator=(const v_uint16x8& vec) {
656 *pval = *(vec.pval);
657 return *this;
658 }
659 inline v_uint16x8(const v_uint16x8& vec) {
660 *pval = *(vec.pval);
661 }
662 ushort val[8];
663 vuint16m1_t* pval = (vuint16m1_t*)val;
664};
665
666struct v_int16x8
667{
668 typedef short lane_type;
669 enum { nlanes = 8 };
670
671 v_int16x8() {}
672 explicit v_int16x8(vint16m1_t v)
673 {
674 *pval = v;
675 }
676 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
677 {
678 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
679 *pval = vle16_v_i16m1(v, nlanes);
680 }
681 operator vint16m1_t() const
682 {
683 return *pval;
684 }
685 short get0() const
686 {
687 return vmv_x(*pval);
688 }
689
690 inline v_int16x8& operator=(const v_int16x8& vec) {
691 *pval = *(vec.pval);
692 return *this;
693 }
694 inline v_int16x8(const v_int16x8& vec) {
695 *pval = *(vec.pval);
696 }
697 short val[8];
698 vint16m1_t* pval = (vint16m1_t*)val;
699};
700
701struct v_uint32x4
702{
703 typedef unsigned lane_type;
704 enum { nlanes = 4 };
705
706 v_uint32x4() {}
707 explicit v_uint32x4(vuint32m1_t v)
708 {
709 *pval = v;
710 }
711 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
712 {
713 unsigned v[] = {v0, v1, v2, v3};
714 *pval = vle32_v_u32m1(v, nlanes);
715 }
716 operator vuint32m1_t() const
717 {
718 return *pval;
719 }
720 unsigned get0() const
721 {
722 return vmv_x(*pval);
723 }
724
725 inline v_uint32x4& operator=(const v_uint32x4& vec) {
726 *pval = *(vec.pval);
727 return *this;
728 }
729 inline v_uint32x4(const v_uint32x4& vec) {
730 *pval = *(vec.pval);
731 }
732 unsigned val[4];
733 vuint32m1_t* pval = (vuint32m1_t*)val;
734};
735
736struct v_int32x4
737{
738 typedef int lane_type;
739 enum { nlanes = 4 };
740
741 v_int32x4() {}
742 explicit v_int32x4(vint32m1_t v)
743 {
744 *pval = v;
745 }
746 v_int32x4(int v0, int v1, int v2, int v3)
747 {
748 int v[] = {v0, v1, v2, v3};
749 *pval = vle32_v_i32m1(v, nlanes);
750 }
751 operator vint32m1_t() const
752 {
753 return *pval;
754 }
755 int get0() const
756 {
757 return vmv_x(*pval);
758 }
759
760 inline v_int32x4& operator=(const v_int32x4& vec) {
761 *pval = *(vec.pval);
762 return *this;
763 }
764 inline v_int32x4(const v_int32x4& vec) {
765 *pval = *(vec.pval);
766 }
767 int val[4];
768 vint32m1_t* pval = (vint32m1_t*)val;
769};
770
771struct v_float32x4
772{
773 typedef float lane_type;
774 enum { nlanes = 4 };
775
776 v_float32x4() {}
777 explicit v_float32x4(vfloat32m1_t v)
778 {
779 *pval = v;
780 }
781 v_float32x4(float v0, float v1, float v2, float v3)
782 {
783 float v[] = {v0, v1, v2, v3};
784 *pval = vle32_v_f32m1(v, nlanes);
785 }
786 operator vfloat32m1_t() const
787 {
788 return *pval;
789 }
790 float get0() const
791 {
792 return vfmv_f(*pval);
793 }
794 inline v_float32x4& operator=(const v_float32x4& vec) {
795 *pval = *(vec.pval);
796 return *this;
797 }
798 inline v_float32x4(const v_float32x4& vec) {
799 *pval = *(vec.pval);
800 }
801 float val[4];
802 vfloat32m1_t* pval = (vfloat32m1_t*)val;
803};
804
805struct v_uint64x2
806{
807 typedef uint64 lane_type;
808 enum { nlanes = 2 };
809
810 v_uint64x2() {}
811 explicit v_uint64x2(vuint64m1_t v)
812 {
813 *pval = v;
814 }
815 v_uint64x2(uint64 v0, uint64 v1)
816 {
817 uint64 v[] = {v0, v1};
818 *pval = vle64_v_u64m1(v, nlanes);
819 }
820 operator vuint64m1_t() const
821 {
822 return *pval;
823 }
824 uint64 get0() const
825 {
826 return vmv_x(*pval);
827 }
828
829 inline v_uint64x2& operator=(const v_uint64x2& vec) {
830 *pval = *(vec.pval);
831 return *this;
832 }
833 inline v_uint64x2(const v_uint64x2& vec) {
834 *pval = *(vec.pval);
835 }
836 uint64 val[2];
837 vuint64m1_t* pval = (vuint64m1_t*)val;
838};
839
840struct v_int64x2
841{
842 typedef int64 lane_type;
843 enum { nlanes = 2 };
844
845 v_int64x2() {}
846 explicit v_int64x2(vint64m1_t v)
847 {
848 *pval = v;
849 }
850 v_int64x2(int64 v0, int64 v1)
851 {
852 int64 v[] = {v0, v1};
853 *pval = vle64_v_i64m1(v, nlanes);
854 }
855 operator vint64m1_t() const
856 {
857 return *pval;
858 }
859 int64 get0() const
860 {
861 return vmv_x(*pval);
862 }
863
864 inline v_int64x2& operator=(const v_int64x2& vec) {
865 *pval = *(vec.pval);
866 return *this;
867 }
868 inline v_int64x2(const v_int64x2& vec) {
869 *pval = *(vec.pval);
870 }
871 int64 val[2];
872 vint64m1_t* pval = (vint64m1_t*)val;
873};
874
875#if CV_SIMD128_64F
876struct v_float64x2
877{
878 typedef double lane_type;
879 enum { nlanes = 2 };
880
881 v_float64x2() {}
882 explicit v_float64x2(vfloat64m1_t v)
883 {
884 *pval = v;
885 }
886 v_float64x2(double v0, double v1)
887 {
888 double v[] = {v0, v1};
889 *pval = vle64_v_f64m1(v, nlanes);
890 }
891 operator vfloat64m1_t() const
892 {
893 return *pval;
894 }
895 double get0() const
896 {
897 return vfmv_f(*pval);
898 }
899
900 inline v_float64x2& operator=(const v_float64x2& vec) {
901 *pval = *(vec.pval);
902 return *this;
903 }
904 inline v_float64x2(const v_float64x2& vec) {
905 *pval = *(vec.pval);
906 }
907 double val[2];
908 vfloat64m1_t* pval = (vfloat64m1_t*)val;
909};
910#endif // CV_SIMD128_64F
911#endif // __clang__
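// A brief sketch of the wrapper types above, assuming a 128-bit RVV register:
// each v_xxx type models one register with a fixed lane count and converts
// implicitly to and from the corresponding native m1 type. The function name
// and lane values below are illustrative only.
inline int rvv_wrapper_sketch()
{
    v_int32x4 a(1, 2, 3, 4);     // build from scalars
    vint32m1_t raw = a;          // implicit conversion to the native type
    v_int32x4 b(raw);            // wrap the native value again
    return b.get0();             // first lane, i.e. 1
}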
912
914
915#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
916inline v_##_Tpvec v_setzero_##suffix1() \
917{ \
918 return v_##_Tpvec(vmv_v_x_##suffix2##m1(0, vl)); \
919} \
920inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
921{ \
922 return v_##_Tpvec(vmv_v_x_##suffix2##m1(v, vl)); \
923}
924
925OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, u8, u8, 16)
926OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, s8, i8, 16)
927OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, u16, u16, 8)
928OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, s16, i16, 8)
929OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, u32, u32, 4)
930OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, s32, i32, 4)
931OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, u64, u64, 2)
932OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, s64, i64, 2)
933
934#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
935inline v_##_Tpv v_setzero_##suffix() \
936{ \
937 return v_##_Tpv(vfmv_v_f_##suffix##m1(0, vl)); \
938} \
939inline v_##_Tpv v_setall_##suffix(_Tp v) \
940{ \
941 return v_##_Tpv(vfmv_v_f_##suffix##m1(v, vl)); \
942}
943
944OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, f32, 4)
945#if CV_SIMD128_64F
946OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, f64, 2)
947#endif
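// Initializer sketch: v_setzero_* broadcasts 0 and v_setall_* broadcasts a
// given scalar to every lane. The constant and function name are illustrative.
inline v_float32x4 rvv_setall_sketch()
{
    v_float32x4 zeros = v_setzero_f32();   // every lane is 0.f
    (void)zeros;                           // kept only for illustration
    return v_setall_f32(2.0f);             // every lane is 2.f
}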
948
950
951#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
952inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }
953
954OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
955OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
956OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
957OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
958OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
959OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
960OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
961OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
962OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
963#if CV_SIMD128_64F
964OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
965#endif
966
967#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
968inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
969{ \
970 return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
971} \
972inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
973{ \
974 return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
975}
976
977OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, int8x16, u8, s8, u8, i8)
978OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, int16x8, u16, s16, u16, i16)
979OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, int32x4, u32, s32, u32, i32)
980OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, float32x4, u32, f32, u32, f32)
981OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, float32x4, s32, f32, i32, f32)
982OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, int64x2, u64, s64, u64, i64)
983#if CV_SIMD128_64F
984OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, float64x2, u64, f64, u64, f64)
985OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64x2, float64x2, s64, f64, i64, f64)
986#endif
987OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint16x8, u8, u16, u8, u16)
988OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint32x4, u8, u32, u8, u32)
989OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint64x2, u8, u64, u8, u64)
990OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint32x4, u16, u32, u16, u32)
991OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint64x2, u16, u64, u16, u64)
992OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, uint64x2, u32, u64, u32, u64)
993OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int16x8, s8, s16, i8, i16)
994OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int32x4, s8, s32, i8, i32)
995OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int64x2, s8, s64, i8, i64)
996OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int32x4, s16, s32, i16, i32)
997OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int64x2, s16, s64, i16, i64)
998OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, int64x2, s32, s64, i32, i64)
999
1000
1001#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
1002inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
1003{ \
1004 return v_##_Tpvec1(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v)));\
1005} \
1006inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
1007{ \
1008 return v_##_Tpvec2(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v)));\
1009}
1010
1011OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int16x8, u8, s16, u, i, 8, 16)
1012OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int32x4, u8, s32, u, i, 8, 32)
1013OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int64x2, u8, s64, u, i, 8, 64)
1014OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int8x16, u16, s8, u, i, 16, 8)
1015OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int32x4, u16, s32, u, i, 16, 32)
1016OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int64x2, u16, s64, u, i, 16, 64)
1017OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int8x16, u32, s8, u, i, 32, 8)
1018OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int16x8, u32, s16, u, i, 32, 16)
1019OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int64x2, u32, s64, u, i, 32, 64)
1020OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int8x16, u64, s8, u, i, 64, 8)
1021OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int16x8, u64, s16, u, i, 64, 16)
1022OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int32x4, u64, s32, u, i, 64, 32)
1023OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float32x4, u8, f32, u, f, 8, 32)
1024OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float32x4, u16, f32, u, f, 16, 32)
1025OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, float32x4, u64, f32, u, f, 64, 32)
1026OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float32x4, s8, f32, i, f, 8, 32)
1027OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float32x4, s16, f32, i, f, 16, 32)
1028OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64x2, float32x4, s64, f32, i, f, 64, 32)
1029#if CV_SIMD128_64F
1030OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float64x2, u8, f64, u, f, 8, 64)
1031OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float64x2, u16, f64, u, f, 16, 64)
1032OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, float64x2, u32, f64, u, f, 32, 64)
1033OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float64x2, s8, f64, i, f, 8, 64)
1034OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float64x2, s16, f64, i, f, 16, 64)
1035OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32x4, float64x2, s32, f64, i, f, 32, 64)
1036#endif
1037
1038// Three times reinterpret
1039#if CV_SIMD128_64F
1040inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) \
1041{ \
1042 return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v))));\
1043} \
1044inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) \
1045{ \
1046 return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v))));\
1047}
1048#endif
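// Reinterpret sketch: the helpers above keep the 128-bit pattern intact and
// only change how the lanes are typed. The function name is illustrative.
inline v_uint32x4 rvv_reinterpret_sketch(const v_float32x4& f)
{
    return v_reinterpret_as_u32(f);   // same bits, now viewed as 4 x u32
}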
1049
1051
1052#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vmv, vl) \
1053template <int s> \
1054inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
1055{ \
1056 return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
1057} \
1058template<int i> inline _Tp v_extract_n(_Tpvec v) \
1059{ \
1060 return _Tp(vmv(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), v, i, vl))); \
1061}
1062
1063
1064OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8x16, uchar, u8, vmv_x_s_u8m1_u8, 16)
1065OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8x16, schar, i8, vmv_x_s_i8m1_i8, 16)
1066OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16x8, ushort, u16, vmv_x_s_u16m1_u16, 8)
1067OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16x8, short, i16, vmv_x_s_i16m1_i16, 8)
1068OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32x4, uint, u32, vmv_x_s_u32m1_u32, 4)
1069OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32x4, int, i32, vmv_x_s_i32m1_i32, 4)
1070OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64x2, uint64, u64, vmv_x_s_u64m1_u64, 2)
1071OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64x2, int64, i64, vmv_x_s_i64m1_i64, 2)
1072
1073#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vmv, vl) \
1074template <int s> \
1075inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
1076{ \
1077 return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
1078} \
1079template<int i> inline _Tp v_extract_n(_Tpvec v) \
1080{ \
1081 return _Tp(vmv(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), v, i, vl))); \
1082}
1083
1084OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32x4, float, f32, vfmv_f_s_f32m1_f32, 4)
1085#if CV_SIMD128_64F
1086OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64x2, double, f64, vfmv_f_s_f64m1_f64, 2)
1087#endif
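// Extract sketch: v_extract<s> concatenates the upper lanes of 'a' with the
// lower lanes of 'b', and v_extract_n<i> reads a single lane. The shift amount
// and lane index below are illustrative.
inline int rvv_extract_sketch(const v_int32x4& a, const v_int32x4& b)
{
    v_int32x4 window = v_extract<1>(a, b);   // lanes {a1, a2, a3, b0}
    return v_extract_n<2>(window);           // lane 2 of the result, i.e. a3
}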
1088
1090
1091#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
1092inline _Tpvec v_load(const _Tp* ptr) \
1093{ \
1094 return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
1095} \
1096inline _Tpvec v_load_aligned(const _Tp* ptr) \
1097{ \
1098 return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
1099} \
1100inline _Tpvec v_load_low(const _Tp* ptr) \
1101{ \
1102 _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
1103 return res; \
1104} \
1105inline void v_store(_Tp* ptr, const _Tpvec& a) \
1106{ \
1107 vse##width##_v_##suffix##m1(ptr, a, vl); \
1108} \
1109inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1110{ \
1111 vse##width##_v_##suffix##m1(ptr, a, vl); \
1112} \
1113inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1114{ \
1115 vse##width##_v_##suffix##m1(ptr, a, vl); \
1116} \
1117inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1118{ \
1119 vse##width##_v_##suffix##m1(ptr, a, vl); \
1120} \
1121inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1122{ \
1123 vse##width##_v_##suffix##m1(ptr, a, hvl); \
1124} \
1125inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1126{ \
1127 vse##width##_v_##suffix##m1(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
1128}
1129
1130OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 16, 8, u8, vmv_v_x_u8m1)
1131OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 16, 8, i8, vmv_v_x_i8m1)
1132OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 8, 16, u16, vmv_v_x_u16m1)
1133OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 8, 16, i16, vmv_v_x_i16m1)
1134OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 4, 32, u32, vmv_v_x_u32m1)
1135OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 4, 32, i32, vmv_v_x_i32m1)
1136OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 2, 64, u64, vmv_v_x_u64m1)
1137OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 2, 64, i64, vmv_v_x_i64m1)
1138OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 4, 32, f32, vfmv_v_f_f32m1)
1139#if CV_SIMD128_64F
1140OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f64, vfmv_v_f_f64m1)
1141#endif
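// Load/store sketch: v_load reads one full register from memory (alignment is
// not required) and v_store writes it back. Pointer names are illustrative.
inline void rvv_copy16_sketch(const uchar* src, uchar* dst)
{
    v_uint8x16 v = v_load(src);   // 16 uchar lanes from memory
    v_store(dst, v);              // write all 16 lanes to dst
}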
1142
1143inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
1144{
1145 schar elems[16] =
1146 {
1147 ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
1148 ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
1149 };
1150 return v_int8x16(vle8_v_i8m1(elems, 16));
1151}
1152inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1)); }
1153
1154inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
1155{
1156 short elems[8] =
1157 {
1158 ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
1159 };
1160 return v_int16x8(vle16_v_i16m1(elems, 8));
1161}
1162inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1)); }
1163
1164inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
1165{
1166 int elems[4] =
1167 {
1168 ptr0[0], ptr0[1], ptr1[0], ptr1[1]
1169 };
1170 return v_int32x4(vle32_v_i32m1(elems, 4));
1171}
1172inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
1173{
1174 float elems[4] =
1175 {
1176 ptr0[0], ptr0[1], ptr1[0], ptr1[1]
1177 };
1178 return v_float32x4(vle32_v_f32m1(elems, 4));
1179}
1180inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1)); }
1181
1182inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
1183{
1184 int64 elems[2] =
1185 {
1186 ptr0[0], ptr1[0]
1187 };
1188 return v_int64x2(vle64_v_i64m1(elems, 2));
1189}
1190inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1)); }
1191
1192#if CV_SIMD128_64F
1193inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
1194{
1195 double elems[2] =
1196 {
1197 ptr0[0], ptr1[0]
1198 };
1199 return v_float64x2(vle64_v_f64m1(elems, 2));
1200}
1201#endif
1202
1203
1205
1206inline v_int8x16 v_lut(const schar* tab, const int* idx)
1207{
1208 schar elems[16] =
1209 {
1210 tab[idx[ 0]],
1211 tab[idx[ 1]],
1212 tab[idx[ 2]],
1213 tab[idx[ 3]],
1214 tab[idx[ 4]],
1215 tab[idx[ 5]],
1216 tab[idx[ 6]],
1217 tab[idx[ 7]],
1218 tab[idx[ 8]],
1219 tab[idx[ 9]],
1220 tab[idx[10]],
1221 tab[idx[11]],
1222 tab[idx[12]],
1223 tab[idx[13]],
1224 tab[idx[14]],
1225 tab[idx[15]]
1226 };
1227 return v_int8x16(vle8_v_i8m1(elems, 16));
1228}
1229inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
1230{
1231 schar elems[16] =
1232 {
1233 tab[idx[0]],
1234 tab[idx[0] + 1],
1235 tab[idx[1]],
1236 tab[idx[1] + 1],
1237 tab[idx[2]],
1238 tab[idx[2] + 1],
1239 tab[idx[3]],
1240 tab[idx[3] + 1],
1241 tab[idx[4]],
1242 tab[idx[4] + 1],
1243 tab[idx[5]],
1244 tab[idx[5] + 1],
1245 tab[idx[6]],
1246 tab[idx[6] + 1],
1247 tab[idx[7]],
1248 tab[idx[7] + 1]
1249 };
1250 return v_int8x16(vle8_v_i8m1(elems, 16));
1251}
1252inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1253{
1254 schar elems[16] =
1255 {
1256 tab[idx[0]],
1257 tab[idx[0] + 1],
1258 tab[idx[0] + 2],
1259 tab[idx[0] + 3],
1260 tab[idx[1]],
1261 tab[idx[1] + 1],
1262 tab[idx[1] + 2],
1263 tab[idx[1] + 3],
1264 tab[idx[2]],
1265 tab[idx[2] + 1],
1266 tab[idx[2] + 2],
1267 tab[idx[2] + 3],
1268 tab[idx[3]],
1269 tab[idx[3] + 1],
1270 tab[idx[3] + 2],
1271 tab[idx[3] + 3]
1272 };
1273 return v_int8x16(vle8_v_i8m1(elems, 16));
1274}
1275inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1276inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1277inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
1278
1279inline v_int16x8 v_lut(const short* tab, const int* idx)
1280{
1281 short elems[8] =
1282 {
1283 tab[idx[0]],
1284 tab[idx[1]],
1285 tab[idx[2]],
1286 tab[idx[3]],
1287 tab[idx[4]],
1288 tab[idx[5]],
1289 tab[idx[6]],
1290 tab[idx[7]]
1291 };
1292 return v_int16x8(vle16_v_i16m1(elems, 8));
1293}
1294inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1295{
1296 short elems[8] =
1297 {
1298 tab[idx[0]],
1299 tab[idx[0] + 1],
1300 tab[idx[1]],
1301 tab[idx[1] + 1],
1302 tab[idx[2]],
1303 tab[idx[2] + 1],
1304 tab[idx[3]],
1305 tab[idx[3] + 1]
1306 };
1307 return v_int16x8(vle16_v_i16m1(elems, 8));
1308}
1309inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1310{
1311 short elems[8] =
1312 {
1313 tab[idx[0]],
1314 tab[idx[0] + 1],
1315 tab[idx[0] + 2],
1316 tab[idx[0] + 3],
1317 tab[idx[1]],
1318 tab[idx[1] + 1],
1319 tab[idx[1] + 2],
1320 tab[idx[1] + 3]
1321 };
1322 return v_int16x8(vle16_v_i16m1(elems, 8));
1323}
1324inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
1325inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
1326inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
1327
1328inline v_int32x4 v_lut(const int* tab, const int* idx)
1329{
1330 int elems[4] =
1331 {
1332 tab[idx[0]],
1333 tab[idx[1]],
1334 tab[idx[2]],
1335 tab[idx[3]]
1336 };
1337 return v_int32x4(vle32_v_i32m1(elems, 4));
1338}
1339inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1340{
1341 int elems[4] =
1342 {
1343 tab[idx[0]],
1344 tab[idx[0] + 1],
1345 tab[idx[1]],
1346 tab[idx[1] + 1]
1347 };
1348 return v_int32x4(vle32_v_i32m1(elems, 4));
1349}
1350inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1351{
1352 return v_int32x4(vle32_v_i32m1(tab + idx[0], 4));
1353}
1354
1355inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
1356inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1357inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1358
1359inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1360{
1361 int64_t elems[2] =
1362 {
1363 tab[idx[0]],
1364 tab[idx[1]]
1365 };
1366 return v_int64x2(vle64_v_i64m1(elems, 2));
1367}
1368inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx)
1369{
1370 return v_int64x2(vle64_v_i64m1(tab + idx[0], 2));
1371}
1372inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
1373inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1374
1375inline v_float32x4 v_lut(const float* tab, const int* idx)
1376{
1377 float elems[4] =
1378 {
1379 tab[idx[0]],
1380 tab[idx[1]],
1381 tab[idx[2]],
1382 tab[idx[3]]
1383 };
1384 return v_float32x4(vle32_v_f32m1(elems, 4));
1385}
1386inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1387{
1388 float elems[4] =
1389 {
1390 tab[idx[0]],
1391 tab[idx[0] + 1],
1392 tab[idx[1]],
1393 tab[idx[1] + 1]
1394 };
1395 return v_float32x4(vle32_v_f32m1(elems, 4));
1396}
1397inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1398{
1399 return v_float32x4(vle32_v_f32m1(tab + idx[0], 4));
1400}
1401
1402inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1403{
1404 int elems[4] =
1405 {
1406 tab[v_extract_n<0>(idxvec)],
1407 tab[v_extract_n<1>(idxvec)],
1408 tab[v_extract_n<2>(idxvec)],
1409 tab[v_extract_n<3>(idxvec)]
1410 };
1411 return v_int32x4(vle32_v_i32m1(elems, 4));
1412}
1413
1414inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1415{
1416 unsigned elems[4] =
1417 {
1418 tab[v_extract_n<0>(idxvec)],
1419 tab[v_extract_n<1>(idxvec)],
1420 tab[v_extract_n<2>(idxvec)],
1421 tab[v_extract_n<3>(idxvec)]
1422 };
1423 return v_uint32x4(vle32_v_u32m1(elems, 4));
1424}
1425
1426inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1427{
1428 float elems[4] =
1429 {
1430 tab[v_extract_n<0>(idxvec)],
1431 tab[v_extract_n<1>(idxvec)],
1432 tab[v_extract_n<2>(idxvec)],
1433 tab[v_extract_n<3>(idxvec)]
1434 };
1435 return v_float32x4(vle32_v_f32m1(elems, 4));
1436}
1437
1438inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1439{
1440 int idx[4];
1441 v_store_aligned(idx, idxvec);
1442
1443 x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1444 y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
1445}
1446
1447#if CV_SIMD128_64F
1448inline v_float64x2 v_lut(const double* tab, const int* idx)
1449{
1450 double elems[2] =
1451 {
1452 tab[idx[0]],
1453 tab[idx[1]]
1454 };
1455 return v_float64x2(vle64_v_f64m1(elems, 2));
1456}
1457
1458inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1459{
1460 return v_float64x2(vle64_v_f64m1(tab + idx[0], 2));
1461}
1462
1463inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1464{
1465 double elems[2] =
1466 {
1467 tab[v_extract_n<0>(idxvec)],
1468 tab[v_extract_n<1>(idxvec)]
1469 };
1470 return v_float64x2(vle64_v_f64m1(elems, 2));
1471}
1472
1473inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1474{
1475 int idx[4] = {0};
1476 v_store_aligned(idx, idxvec);
1477
1478 x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1479 y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
1480}
1481#endif
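// LUT sketch: v_lut gathers scattered table elements into one vector. The
// table pointer, index values, and function name below are illustrative.
inline v_float32x4 rvv_lut_sketch(const float* tab)
{
    int idx[4] = {3, 1, 0, 2};
    return v_lut(tab, idx);   // {tab[3], tab[1], tab[0], tab[2]}
}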
1482
1484
1485inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1486{
1487 ushort ptr[16] = {0};
1488 v_store(ptr, a);
1489 v_store(ptr + 8, b);
1490 return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr, 16), 0, 16));
1491}
1492
1493inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1494 const v_uint32x4& c, const v_uint32x4& d)
1495{
1496 unsigned ptr[16] = {0};
1497 v_store(ptr, a);
1498 v_store(ptr + 4, b);
1499 v_store(ptr + 8, c);
1500 v_store(ptr + 12, d);
1501 return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr, 16), 0, 16), 0, 16));
1502}
1503
1504inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
1505 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
1506 const v_uint64x2& g, const v_uint64x2& h)
1507{
1508 uint64 ptr[16] = {0};
1509 v_store(ptr, a);
1510 v_store(ptr + 2, b);
1511 v_store(ptr + 4, c);
1512 v_store(ptr + 6, d);
1513 v_store(ptr + 8, e);
1514 v_store(ptr + 10, f);
1515 v_store(ptr + 12, g);
1516 v_store(ptr + 14, h);
1517 return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr, 16), 0, 16), 0, 16), 0, 16));
1518}
1519
1521#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, vl) \
1522inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
1523{ \
1524 return _Tpvec(intrin(a, b, vl)); \
1525} \
1526inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
1527{ \
1528 a = _Tpvec(intrin(a, b, vl)); \
1529 return a; \
1530}
1531
1532OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 16)
1533OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 16)
1534OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 16)
1535OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 16)
1536OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 16)
1537OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 16)
1538OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 8)
1539OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 8)
1540OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 8)
1541OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 8)
1542OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 8)
1543OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 8)
1544OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 4)
1545OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 4)
1546OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 4)
1547OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 4)
1548OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 4)
1549OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 4)
1550OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 4)
1551OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 4)
1552OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 4)
1553OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 4)
1554OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 4)
1555OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 4)
1556OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 2)
1557OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 2)
1558OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 2)
1559OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 2)
1560OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 2)
1561OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 2)
1562OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 2)
1563OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 2)
1564#if CV_SIMD128_64F
1565OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 2)
1566OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 2)
1567OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 2)
1568OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 2)
1569#endif
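// Arithmetic sketch: for 8- and 16-bit integer vectors the operators above map
// to the saturating RVV instructions, so lanes clamp instead of wrapping. The
// constants below are illustrative.
inline v_uint8x16 rvv_saturating_add_sketch()
{
    v_uint8x16 a = v_setall_u8(250);
    v_uint8x16 b = v_setall_u8(10);
    return a + b;   // every lane is 255 (saturated), not 4 (wrapped)
}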
1570
1571
1573
1574#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, vl) \
1575OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, vl) \
1576OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, vl) \
1577OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, vl) \
1578inline _Tpvec operator ~ (const _Tpvec& a) \
1579{ \
1580 return _Tpvec(vnot_v_##suffix##m1(a, vl)); \
1581}
1582
1583OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 16)
1584OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 16)
1585OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 8)
1586OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 8)
1587OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 4)
1588OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 4)
1589OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 2)
1590OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 2)
1591
1592#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
1593inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
1594{ \
1595 return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
1596} \
1597inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
1598{ \
1599 a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
1600 return a; \
1601}
1602
1603OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1)
1604OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
1605OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)
1606
1607inline v_float32x4 operator ~ (const v_float32x4& a)
1608{
1609 return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a), 4)));
1610}
1611
1612#if CV_SIMD128_64F
1613#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
1614inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
1615{ \
1616 return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
1617} \
1618inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
1619{ \
1620 a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
1621 return a; \
1622}
1623
1624OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1)
1625OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
1626OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)
1627
1628inline v_float64x2 operator ~ (const v_float64x2& a)
1629{
1630 return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a), 2)));
1631}
1632#endif
1633
1635
1636#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
1637inline _Tpvec operator << (const _Tpvec& a, int n) \
1638{ \
1639 return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
1640} \
1641inline _Tpvec operator >> (const _Tpvec& a, int n) \
1642{ \
1643 return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
1644} \
1645template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1646{ \
1647 return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
1648} \
1649template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1650{ \
1651 return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
1652}
1653
1654#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
1655inline _Tpvec operator << (const _Tpvec& a, int n) \
1656{ \
1657 return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
1658} \
1659inline _Tpvec operator >> (const _Tpvec& a, int n) \
1660{ \
1661 return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
1662} \
1663template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1664{ \
1665 return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
1666} \
1667template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1668{ \
1669 return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
1670}
1671
1672OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 16)
1673OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 8)
1674OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 4)
1675OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 2)
1676OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 16)
1677OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 8)
1678OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 4)
1679OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2)
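// Shift sketch: the template forms take the shift amount at compile time and
// behave the same as the run-time operators. The amount below is illustrative.
inline v_uint32x4 rvv_shift_sketch(const v_uint32x4& a)
{
    return v_shr<3>(a);   // logical shift right by 3, same as a >> 3
}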
1680
1681
1683
1684#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
1685inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
1686{ \
1687 uint64_t ones = -1; \
1688 return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); \
1689}
1690
1691#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
1692inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
1693{ \
1694 union { uint64 u; double d; } ones; ones.u = -1; \
1695 return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
1696}
1697
1698#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) \
1699OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
1700OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
1701OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, vl) \
1702OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, vl) \
1703OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, vl) \
1704OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, vl)
1705
1706#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width, vl) \
1707OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
1708OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
1709OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, vl) \
1710OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, vl) \
1711OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, vl) \
1712OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, vl)
1713
1714#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width, vl) \
1715OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, vl) \
1716OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, vl) \
1717OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, vl) \
1718OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, vl) \
1719OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, vl) \
1720OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, vl)
1721
1722
1723OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8, 16)
1724OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16, 8)
1725OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32, 4)
1726OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64, 2)
1727OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8, 16)
1728OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16, 8)
1729OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32, 4)
1730OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64, 2)
1731OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32, 4)
1732#if CV_SIMD128_64F
1733OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64, 2)
1734#endif
1735
1736inline v_float32x4 v_not_nan(const v_float32x4& a)
1737{ return a == a; }
1738
1739#if CV_SIMD128_64F
1740inline v_float64x2 v_not_nan(const v_float64x2& a)
1741{ return a == a; }
1742#endif
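// Comparison sketch: each operator above yields a mask vector whose lanes are
// all ones where the predicate holds and all zeros elsewhere. The operand and
// function names are illustrative.
inline v_uint8x16 rvv_compare_sketch(const v_uint8x16& a, const v_uint8x16& b)
{
    return a < b;   // 0xFF where a[i] < b[i], 0x00 otherwise
}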
1743
1745
1746#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
1747inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1748{ \
1749 return _Tpvec(intrin(a, b, vl)); \
1750}
1751
1752OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
1753OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
1754OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
1755OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
1756OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
1757OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
1758OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
1759OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
1760OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
1761OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
1762OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
1763OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
1764OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
1765OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
1766OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 2)
1767OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 2)
1768OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 2)
1769OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 2)
1770#if CV_SIMD128_64F
1771OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
1772OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
1773#endif
1774
1776
1777OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
1778OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
1779OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
1780OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
1781OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
1782OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
1783OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
1784OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
1785OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
1786OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
1787OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
1788OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
1789
1791
1792#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
1793inline scalartype v_reduce_sum(const _Tpvec& a) \
1794{ \
1795 _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
1796 _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \
1797 res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
1798 return (scalartype)(_wTpvec(res).get0()); \
1799}
1800
1801OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
1802OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
1803OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 8, wredsumu)
1804OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 8, wredsum)
1805OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 4, wredsumu)
1806OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 4, wredsum)
1807OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 2, redsum)
1808OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 2, redsum)
1809
1810#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
1811inline scalartype v_reduce_sum(const _Tpvec& a) \
1812{ \
1813 _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
1814 _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \
1815 res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
1816 return (scalartype)(_wTpvec(res).get0()); \
1817}
1818
1819// vfredsum for float has been renamed to vfredosum; this has also been updated in the GNU toolchain.
1820OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 4, fredosum)
1821#if CV_SIMD128_64F
1822OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 2, fredosum)
1823#endif
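// Reduction sketch: v_reduce_sum accumulates all lanes into one scalar of the
// widened type (unsigned for u8 input). The parameter name is illustrative.
inline unsigned rvv_reduce_sum_sketch(const v_uint8x16& a)
{
    return v_reduce_sum(a);   // sum of the 16 uchar lanes
}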
1824
1825
1826#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
1827inline scalartype v_reduce_##func(const _Tpvec& a) \
1828{ \
1829 _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a, vl)); \
1830 return scalartype(res.get0()); \
1831}
1832
1833OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 16, redminu)
1834OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 16, redmin)
1835OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 8, redminu)
1836OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 8, redmin)
1837OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 4, redminu)
1838OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 4, redmin)
1839OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 4, fredmin)
1840OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 16, redmaxu)
1841OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 16, redmax)
1842OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 8, redmaxu)
1843OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 8, redmax)
1844OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 4, redmaxu)
1845OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 4, redmax)
1846OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax)
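// Illustrative usage of the reductions above (a sketch, not part of the upstream
// header; it only uses the universal-intrinsics API defined in this file):
// v_reduce_sum collapses a whole register to one scalar, while
// v_reduce_min/v_reduce_max return the extreme lane value.
//
//   float buf[4] = {1.f, 2.f, 3.f, 4.f};
//   v_float32x4 v = v_load(buf);
//   float s  = v_reduce_sum(v);   // 10.0f
//   float mn = v_reduce_min(v);   // 1.0f
//   float mx = v_reduce_max(v);   // 4.0f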
1847
1848
1849inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1850 const v_float32x4& c, const v_float32x4& d)
1851{
1852 float elems[4] =
1853 {
1854 v_reduce_sum(a),
1855 v_reduce_sum(b),
1856 v_reduce_sum(c),
1857 v_reduce_sum(d)
1858 };
1859 return v_float32x4(vle32_v_f32m1(elems, 4));
1860}
1861
1863
1864inline v_float32x4 v_sqrt(const v_float32x4& x)
1865{
1866 return v_float32x4(vfsqrt_v_f32m1(x, 4));
1867}
1868
1869inline v_float32x4 v_invsqrt(const v_float32x4& x)
1870{
1871 v_float32x4 one = v_setall_f32(1.0f);
1872 return one / v_sqrt(x);
1873}
1874
1875#if CV_SIMD128_64F
1876inline v_float64x2 v_sqrt(const v_float64x2& x)
1877{
1878 return v_float64x2(vfsqrt_v_f64m1(x, 2));
1879}
1880
1881inline v_float64x2 v_invsqrt(const v_float64x2& x)
1882{
1883 v_float64x2 one = v_setall_f64(1.0);
1884 return one / v_sqrt(x);
1885}
1886#endif
1887
1888inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
1889{
1890 v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
1891 return v_sqrt(x);
1892}
1893
1894inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
1895{
1896 return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
1897}
1898
1899#if CV_SIMD128_64F
1900inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
1901{
1902 v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
1903 return v_sqrt(x);
1904}
1905
1906inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
1907{
1908 return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
1909}
1910#endif
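// Sketch of how the magnitude helpers are typically used (illustrative only,
// based on the definitions above): v_magnitude computes sqrt(a*a + b*b) per lane
// via a fused multiply-add and v_sqrt; v_sqr_magnitude returns the same quantity
// without the square root.
//
//   v_float32x4 dx = v_setall_f32(3.0f), dy = v_setall_f32(4.0f);
//   v_float32x4 len  = v_magnitude(dx, dy);      // each lane == 5.0f
//   v_float32x4 len2 = v_sqr_magnitude(dx, dy);  // each lane == 25.0f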
1911
1913
1914inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1915{
1916 return v_float32x4(vfmacc_vv_f32m1(c, a, b, 4));
1917}
1918inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1919{
1920 return v_int32x4(vmacc_vv_i32m1(c, a, b, 4));
1921}
1922
1923inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1924{
1925 return v_fma(a, b, c);
1926}
1927
1928inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1929{
1930 return v_fma(a, b, c);
1931}
1932
1933#if CV_SIMD128_64F
1934inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1935{
1936 return v_float64x2(vfmacc_vv_f64m1(c, a, b, 2));
1937}
1938
1939inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1940{
1941 return v_fma(a, b, c);
1942}
1943#endif
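// Usage sketch (not part of the upstream header): v_fma/v_muladd evaluate
// a*b + c per lane; the float variants map to vfmacc, the int32 variant to vmacc.
//
//   v_float32x4 a = v_setall_f32(2.0f), b = v_setall_f32(3.0f), c = v_setall_f32(1.0f);
//   v_float32x4 r = v_fma(a, b, c);      // each lane == 7.0f
//   v_float32x4 s = v_muladd(a, b, c);   // synonym for v_fma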
1944
1946
1947// Use the overloaded vcpop in clang; no cast such as (vuint64m1_t) is needed.
1948#ifndef __clang__
1949#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \
1950inline bool v_check_all(const _Tpvec& a) \
1951{ \
1952 auto v0 = vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl); \
1953 v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
1954 return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) == 0; \
1955} \
1956inline bool v_check_any(const _Tpvec& a) \
1957{ \
1958 auto v0 = vsrl_vx_##suffix##m1(a, shift, vl); \
1959 v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
1960 return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) != 0; \
1961}
1962
1963OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16)
1964OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8)
1965OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4)
1966//OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 2)
1967inline bool v_check_all(const v_uint64x2& a)
1968{
1969 v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(vnot_v_u64m1(a, 2), 63, 2));
1970 return (v.val[0] | v.val[1]) == 0;
1971}
1972inline bool v_check_any(const v_uint64x2& a)
1973{
1974 v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(a, 63, 2));
1975 return (v.val[0] | v.val[1]) != 0;
1976}
1977
1978inline bool v_check_all(const v_int8x16& a)
1979{ return v_check_all(v_reinterpret_as_u8(a)); }
1980inline bool v_check_any(const v_int8x16& a)
1981{ return v_check_any(v_reinterpret_as_u8(a)); }
1982
1983inline bool v_check_all(const v_int16x8& a)
1984{ return v_check_all(v_reinterpret_as_u16(a)); }
1985inline bool v_check_any(const v_int16x8& a)
1986{ return v_check_any(v_reinterpret_as_u16(a)); }
1987
1988inline bool v_check_all(const v_int32x4& a)
1989{ return v_check_all(v_reinterpret_as_u32(a)); }
1990inline bool v_check_any(const v_int32x4& a)
1991{ return v_check_any(v_reinterpret_as_u32(a)); }
1992
1993inline bool v_check_all(const v_float32x4& a)
1994{ return v_check_all(v_reinterpret_as_u32(a)); }
1995inline bool v_check_any(const v_float32x4& a)
1996{ return v_check_any(v_reinterpret_as_u32(a)); }
1997
1998inline bool v_check_all(const v_int64x2& a)
1999{ return v_check_all(v_reinterpret_as_u64(a)); }
2000inline bool v_check_any(const v_int64x2& a)
2001{ return v_check_any(v_reinterpret_as_u64(a)); }
2002
2003#if CV_SIMD128_64F
2004inline bool v_check_all(const v_float64x2& a)
2005{ return v_check_all(v_reinterpret_as_u64(a)); }
2006inline bool v_check_any(const v_float64x2& a)
2007{ return v_check_any(v_reinterpret_as_u64(a)); }
2008#endif
2009#else
2010#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \
2011inline bool v_check_all(const _Tpvec& a) \
2012{ \
2013 return vcpop(vmslt(a, 0, vl), vl) == vl; \
2014} \
2015inline bool v_check_any(const _Tpvec& a) \
2016{ \
2017 return vcpop(vmslt(a, 0, vl), vl) != 0; \
2018}
2019
2020OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8x16, 16)
2021OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16x8, 8)
2022OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32x4, 4)
2023OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64x2, 2)
2024
2025
2026inline bool v_check_all(const v_uint8x16& a)
2027{ return v_check_all(v_reinterpret_as_s8(a)); }
2028inline bool v_check_any(const v_uint8x16& a)
2029{ return v_check_any(v_reinterpret_as_s8(a)); }
2030
2031inline bool v_check_all(const v_uint16x8& a)
2032{ return v_check_all(v_reinterpret_as_s16(a)); }
2033inline bool v_check_any(const v_uint16x8& a)
2034{ return v_check_any(v_reinterpret_as_s16(a)); }
2035
2036inline bool v_check_all(const v_uint32x4& a)
2037{ return v_check_all(v_reinterpret_as_s32(a)); }
2038inline bool v_check_any(const v_uint32x4& a)
2039{ return v_check_any(v_reinterpret_as_s32(a)); }
2040
2041inline bool v_check_all(const v_float32x4& a)
2042{ return v_check_all(v_reinterpret_as_s32(a)); }
2043inline bool v_check_any(const v_float32x4& a)
2044{ return v_check_any(v_reinterpret_as_s32(a)); }
2045
2046inline bool v_check_all(const v_uint64x2& a)
2047{ return v_check_all(v_reinterpret_as_s64(a)); }
2048inline bool v_check_any(const v_uint64x2& a)
2049{ return v_check_any(v_reinterpret_as_s64(a)); }
2050
2051#if CV_SIMD128_64F
2052inline bool v_check_all(const v_float64x2& a)
2053{ return v_check_all(v_reinterpret_as_s64(a)); }
2054inline bool v_check_any(const v_float64x2& a)
2055{ return v_check_any(v_reinterpret_as_s64(a)); }
2056#endif
2057#endif
2059
2060#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
2061inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
2062{ \
2063 return v_max(a, b) - v_min(a, b); \
2064}
2065
2066OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff)
2067OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff)
2068OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff)
2069OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff)
2070#if CV_SIMD128_64F
2071OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
2072#endif
2073OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
2074OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
2075
2076#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(ivec, uvec, itype, utype, isuf, usuf, vlen) \
2077inline uvec v_absdiff(const ivec& a, const ivec& b) \
2078{ \
2079 itype max = vmax_vv_##isuf(a, b, vlen); \
2080 itype min = vmin_vv_##isuf(a, b, vlen); \
2081 return uvec(vreinterpret_v_##isuf##_##usuf(vsub_vv_##isuf(max, min, vlen))); \
2082}
2083
2084OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vint8m1_t, vuint8m1_t, i8m1, u8m1, 16)
2085OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vint16m1_t, vuint16m1_t, i16m1, u16m1, 8)
2086OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vint32m1_t, vuint32m1_t, i32m1, u32m1, 4)
2087
2088#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
2089inline _Tprvec v_abs(const _Tpvec& a) \
2090{ \
2091 return v_absdiff(a, v_setzero_##suffix()); \
2092}
2093
2094OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8)
2095OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16)
2096OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32)
2097OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32)
2098#if CV_SIMD128_64F
2099OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64)
2100#endif
2101
2102
2103#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
2104inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
2105{ \
2106 return v_reduce_sum(v_absdiff(a, b)); \
2107}
2108
2109OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned)
2110OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
2111OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
2112OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
2113OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
2114OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
2115OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float)
2116
2117
2118
2119#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, vl) \
2120inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
2121{ \
2122 return _Tpvec(merge(ne(mask, 0, vl), b, a, vl)); \
2123}
2124
2125OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 16)
2126OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 16)
2127OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 8)
2128OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 8)
2129OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 4)
2130OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 4)
2131OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 4)
2132#if CV_SIMD128_64F
2133OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 2)
2134#endif
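// Illustrative sketch of v_select (assumes the comparison operators defined
// earlier in this header, which produce all-ones/all-zeros lanes): lanes where
// the mask is non-zero take values from 'a', the rest from 'b'.
//
//   v_int32x4 a = v_setall_s32(10), b = v_setall_s32(20);
//   v_int32x4 mask = a > b;              // all lanes zero in this example
//   v_int32x4 r = v_select(mask, a, b);  // every lane == 20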
2135
2137
2138#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
2139template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
2140{ \
2141 return _Tpvec(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
2142} \
2143template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
2144{ \
2145 return _Tpvec(vslideup_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
2146} \
2147template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
2148{ return a; } \
2149template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
2150{ \
2151 return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
2152} \
2153template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
2154{ \
2155 return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
2156} \
2157template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
2158{ CV_UNUSED(b); return a; }
2159
2160OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8x16, u8, 16)
2161OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8x16, i8, 16)
2162OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16x8, u16, 8)
2163OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16x8, i16, 8)
2164OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32x4, u32, 4)
2165OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32x4, i32, 4)
2166OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64x2, u64, 2)
2167OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64x2, i64, 2)
2168
2169#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
2170template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
2171{ \
2172 return _Tpvec(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
2173} \
2174template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
2175{ \
2176 return _Tpvec(vslideup_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
2177} \
2178template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
2179{ return a; } \
2180template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
2181{ \
2182 return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
2183} \
2184template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
2185{ \
2186 return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
2187} \
2188template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
2189{ CV_UNUSED(b); return a; }
2190
2191OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32x4, f32, 4)
2192#if CV_SIMD128_64F
2193OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64x2, f64, 2)
2194#endif
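// Rotation sketch (illustrative only): the single-argument forms shift lanes in
// from zero, the two-argument forms shift lanes in from the second vector.
//
//   uchar buf[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
//   v_uint8x16 v = v_load(buf);
//   v_uint8x16 r = v_rotate_right<2>(v);  // {2,3,...,15,0,0}
//   v_uint8x16 l = v_rotate_left<2>(v);   // {0,0,0,1,...,13}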
2195
2197
2198inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2199{
2200 return v_float32x4(vfcvt_f_x_v_f32m1(a, 4));
2201}
2202
2203#if CV_SIMD128_64F
2204#ifndef __clang__
2205inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2206{
2207 double arr[4] = {a.val[0], a.val[1], 0, 0};
2208 vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2209 return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
2210}
2211
2212inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2213{
2214 double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
2215 vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2216 return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
2217}
2218#else
2219inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2220{
2221 vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
2222 return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
2223}
2224inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2225{
2226 vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
2227 return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(dst, 1, b), 4));
2228}
2229#endif
2230
2231inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2232{
2233 double ptr[4] = {0};
2234 vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
2235 double elems[2] =
2236 {
2237 ptr[0], ptr[1]
2238 };
2239 return v_float64x2(vle64_v_f64m1(elems, 2));
2240}
2241
2242inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2243{
2244 double ptr[4] = {0};
2245 vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
2246 double elems[2] =
2247 {
2248 ptr[2], ptr[3]
2249 };
2250 return v_float64x2(vle64_v_f64m1(elems, 2));
2251}
2252
2253inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2254{
2255 double ptr[4] = {0};
2256 vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
2257 double elems[2] =
2258 {
2259 ptr[0], ptr[1]
2260 };
2261 return v_float64x2(vle64_v_f64m1(elems, 2));
2262}
2263
2264inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2265{
2266 double ptr[4] = {0};
2267 vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
2268 double elems[2] =
2269 {
2270 ptr[2], ptr[3]
2271 };
2272 return v_float64x2(vle64_v_f64m1(elems, 2));
2273}
2274
2275inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2276{
2277 return v_float64x2(vfcvt_f_x_v_f64m1(a, 2));
2278}
2279#endif
2280
2282
2283#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
2284template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
2285{ \
2286 return v_setall_##suffix(v_extract_n<i>(v)); \
2287}
2288
2289OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8)
2290OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
2291OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
2292OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
2293OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
2294OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
2295OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
2296OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
2297OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32)
2298#if CV_SIMD128_64F
2299OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64)
2300#endif
2301
2303
2304#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
2305inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
2306 const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
2307 v_##_Tpvec& b0, v_##_Tpvec& b1, \
2308 v_##_Tpvec& b2, v_##_Tpvec& b3) \
2309{ \
2310 _Tp elems0[4] = \
2311 { \
2312 v_extract_n<0>(a0), \
2313 v_extract_n<0>(a1), \
2314 v_extract_n<0>(a2), \
2315 v_extract_n<0>(a3) \
2316 }; \
2317 b0 = v_load(elems0); \
2318 _Tp elems1[4] = \
2319 { \
2320 v_extract_n<1>(a0), \
2321 v_extract_n<1>(a1), \
2322 v_extract_n<1>(a2), \
2323 v_extract_n<1>(a3) \
2324 }; \
2325 b1 = v_load(elems1); \
2326 _Tp elems2[4] = \
2327 { \
2328 v_extract_n<2>(a0), \
2329 v_extract_n<2>(a1), \
2330 v_extract_n<2>(a2), \
2331 v_extract_n<2>(a3) \
2332 }; \
2333 b2 = v_load(elems2); \
2334 _Tp elems3[4] = \
2335 { \
2336 v_extract_n<3>(a0), \
2337 v_extract_n<3>(a1), \
2338 v_extract_n<3>(a2), \
2339 v_extract_n<3>(a3) \
2340 }; \
2341 b3 = v_load(elems3); \
2342}
2343
2344OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
2345OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
2346OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
2347
2348
2349
2350#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \
2351inline _Tpvec v_reverse(const _Tpvec& a) \
2352{ \
2353 _Tp ptr[_Tpvec::nlanes] = {0}; \
2354 _Tp ptra[_Tpvec::nlanes] = {0}; \
2355 v_store(ptra, a); \
2356 for (int i = 0; i < _Tpvec::nlanes; i++) \
2357 { \
2358 ptr[i] = ptra[_Tpvec::nlanes-i-1]; \
2359 } \
2360 return v_load(ptr); \
2361}
2362
2363OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, u8)
2364OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, i8)
2365OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, u16)
2366OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, i16)
2367OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, u32)
2368OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, i32)
2369OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, f32)
2370OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, u64)
2371OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, i64)
2372#if CV_SIMD128_64F
2373OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64)
2374#endif
2375
2377
2378#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) \
2379inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
2380{ \
2381 _Tp lptr[_Tpvec::nlanes/2] = {0}; \
2382 _Tp hptr[_Tpvec::nlanes/2] = {0}; \
2383 v_store_low(lptr, a); \
2384 v_store_high(hptr, a); \
2385 b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
2386 b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
2387} \
2388inline _Tpwvec v_expand_low(const _Tpvec& a) \
2389{ \
2390 _Tp lptr[_Tpvec::nlanes/2] = {0}; \
2391 v_store_low(lptr, a); \
2392 return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
2393} \
2394inline _Tpwvec v_expand_high(const _Tpvec& a) \
2395{ \
2396 _Tp hptr[_Tpvec::nlanes/2] = {0}; \
2397 v_store_high(hptr, a); \
2398 return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
2399} \
2400inline _Tpwvec v_load_expand(const _Tp* ptr) \
2401{ \
2402 return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr, vl), vl)); \
2403}
2404
2405OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1, 8)
2406OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1, 8)
2407OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1, 4)
2408OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1, 4)
2409OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1, 2)
2410OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1, 2)
2411
2412inline v_uint32x4 v_load_expand_q(const uchar* ptr)
2413{
2414 return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr, 4), 4), 4));
2415}
2416
2417inline v_int32x4 v_load_expand_q(const schar* ptr)
2418{
2419 return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr, 4), 4), 4));
2420}
2421
2422
2423#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \
2424inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
2425{ \
2426 _wTp arr[_Tpvec::nlanes] = {0}; \
2427 v_store(arr, a); \
2428 v_store(arr + _wTpvec::nlanes, b); \
2429 return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); \
2430} \
2431inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
2432{ \
2433 _wTp arr[_Tpvec::nlanes] = {0}; \
2434 v_store(arr, a); \
2435 v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
2436 vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); \
2437} \
2438template<int n> inline \
2439_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
2440{ \
2441 _wTp arr[_Tpvec::nlanes] = {0}; \
2442 v_store(arr, a); \
2443 v_store(arr + _wTpvec::nlanes, b); \
2444 return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)); \
2445} \
2446template<int n> inline \
2447void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
2448{ \
2449 _wTp arr[_Tpvec::nlanes] = {0}; \
2450 v_store(arr, a); \
2451 v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
2452 vse##hwidth##_v_##hsuffix##m1(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)), hvl); \
2453}
2454
2455OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
2456OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
2457OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
2458OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
2459OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
2460OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)
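// Packing sketch (illustrative, not part of the upstream header): v_pack narrows
// two wide vectors into one with saturation; v_rshr_pack<n> additionally applies
// a rounding right shift by n before the saturating narrow.
//
//   v_int16x8 a = v_setall_s16(300), b = v_setall_s16(40);
//   v_int8x16 p = v_pack(a, b);          // 300 saturates to 127, 40 stays 40
//   v_int8x16 q = v_rshr_pack<2>(a, b);  // rounding shift by 2: 75 and 10, then narrow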
2461
2462
2463#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
2464inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
2465{ \
2466 _wTp arr[_Tpvec::nlanes] = {0}; \
2467 v_store(arr, a); \
2468 v_store(arr + _wTpvec::nlanes, b); \
2469 return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \
2470} \
2471inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
2472{ \
2473 _wTp arr[_Tpvec::nlanes] = {0}; \
2474 v_store(arr, a); \
2475 v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
2476 vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); \
2477} \
2478template<int n> inline \
2479_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
2480{ \
2481 _wTp arr[_Tpvec::nlanes] = {0}; \
2482 v_store(arr, a); \
2483 v_store(arr + _wTpvec::nlanes, b); \
2484 return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl)); \
2485} \
2486template<int n> inline \
2487void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
2488{ \
2489 _wTp arr[_Tpvec::nlanes] = {0}; \
2490 v_store(arr, a); \
2491 v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
2492 v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl))); \
2493}
2494
2495OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16)
2496OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8)
2497
2498
2499#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \
2500inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
2501{ \
2502 _Tp ptra0[v_##_Tpvec::nlanes] = {0}; \
2503 _Tp ptra1[v_##_Tpvec::nlanes] = {0}; \
2504 _Tp ptrb0[v_##_Tpvec::nlanes] = {0}; \
2505 _Tp ptrb1[v_##_Tpvec::nlanes] = {0}; \
2506 v_store(ptra0, a0); \
2507 v_store(ptra1, a1); \
2508 int i; \
2509 for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \
2510 { \
2511 ptrb0[i*2] = ptra0[i]; \
2512 ptrb0[i*2+1] = ptra1[i]; \
2513 } \
2514 for( ; i < v_##_Tpvec::nlanes; i++ ) \
2515 { \
2516 ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \
2517 ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \
2518 } \
2519 b0 = v_load(ptrb0); \
2520 b1 = v_load(ptrb1); \
2521} \
2522inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2523{ \
2524 _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
2525 _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
2526 v_store_low(ptra, a); \
2527 v_store_low(ptrb, b); \
2528 return v_load_halves(ptra, ptrb); \
2529} \
2530inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2531{ \
2532 _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
2533 _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
2534 v_store_high(ptra, a); \
2535 v_store_high(ptrb, b); \
2536 return v_load_halves(ptra, ptrb); \
2537} \
2538inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
2539{ \
2540 c = v_combine_low(a, b); \
2541 d = v_combine_high(a, b); \
2542}
2543
2544OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, u8)
2545OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, i8)
2546OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, u16)
2547OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, i16)
2548OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, u32)
2549OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, i32)
2550OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, f32)
2551#if CV_SIMD128_64F
2552OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64)
2553#endif
2554
2555
2556#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) \
2557inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
2558{ \
2559 _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2560 _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2561 int i, i2; \
2562 for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2563 { \
2564 ptra[i] = ptr[i2]; \
2565 ptrb[i] = ptr[i2+1]; \
2566 } \
2567 a = v_load(ptra); \
2568 b = v_load(ptrb); \
2569} \
2570inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
2571{ \
2572 _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2573 _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2574 _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
2575 int i, i3; \
2576 for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2577 { \
2578 ptra[i] = ptr[i3]; \
2579 ptrb[i] = ptr[i3+1]; \
2580 ptrc[i] = ptr[i3+2]; \
2581 } \
2582 a = v_load(ptra); \
2583 b = v_load(ptrb); \
2584 c = v_load(ptrc); \
2585} \
2586inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
2587 v_##_Tpvec& c, v_##_Tpvec& d) \
2588{ \
2589 _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2590 _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2591 _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
2592 _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
2593 int i, i4; \
2594 for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2595 { \
2596 ptra[i] = ptr[i4]; \
2597 ptrb[i] = ptr[i4+1]; \
2598 ptrc[i] = ptr[i4+2]; \
2599 ptrd[i] = ptr[i4+3]; \
2600 } \
2601 a = v_load(ptra); \
2602 b = v_load(ptrb); \
2603 c = v_load(ptrc); \
2604 d = v_load(ptrd); \
2605} \
2606inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2607 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2608{ \
2609 int i, i2; \
2610 _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2611 _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2612 v_store(ptra, a); \
2613 v_store(ptrb, b); \
2614 for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2615 { \
2616 ptr[i2] = ptra[i]; \
2617 ptr[i2+1] = ptrb[i]; \
2618 } \
2619} \
2620inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2621 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2622{ \
2623 int i, i3; \
2624 _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2625 _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2626 _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
2627 v_store(ptra, a); \
2628 v_store(ptrb, b); \
2629 v_store(ptrc, c); \
2630 for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2631 { \
2632 ptr[i3] = ptra[i]; \
2633 ptr[i3+1] = ptrb[i]; \
2634 ptr[i3+2] = ptrc[i]; \
2635 } \
2636} \
2637inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2638 const v_##_Tpvec& c, const v_##_Tpvec& d, \
2639 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2640{ \
2641 int i, i4; \
2642 _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
2643 _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
2644 _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
2645 _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
2646 v_store(ptra, a); \
2647 v_store(ptrb, b); \
2648 v_store(ptrc, c); \
2649 v_store(ptrd, d); \
2650 for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2651 { \
2652 ptr[i4] = ptra[i]; \
2653 ptr[i4+1] = ptrb[i]; \
2654 ptr[i4+2] = ptrc[i]; \
2655 ptr[i4+3] = ptrd[i]; \
2656 } \
2657} \
2658inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
2659{ \
2660 _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
2661 _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
2662 v_store(ptrvec, vec); \
2663 for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
2664 { \
2665 ptr[4*i ] = ptrvec[4*i ]; \
2666 ptr[4*i+1] = ptrvec[4*i+2]; \
2667 ptr[4*i+2] = ptrvec[4*i+1]; \
2668 ptr[4*i+3] = ptrvec[4*i+3]; \
2669 } \
2670 return v_load(ptr); \
2671} \
2672inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
2673{ \
2674 _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
2675 _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
2676 v_store(ptrvec, vec); \
2677 for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
2678 { \
2679 ptr[8*i ] = ptrvec[8*i ]; \
2680 ptr[8*i+1] = ptrvec[8*i+4]; \
2681 ptr[8*i+2] = ptrvec[8*i+1]; \
2682 ptr[8*i+3] = ptrvec[8*i+5]; \
2683 ptr[8*i+4] = ptrvec[8*i+2]; \
2684 ptr[8*i+5] = ptrvec[8*i+6]; \
2685 ptr[8*i+6] = ptrvec[8*i+3]; \
2686 ptr[8*i+7] = ptrvec[8*i+7]; \
2687 } \
2688 return v_load(ptr); \
2689}
2690
2691OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar)
2692OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar)
2693OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort)
2694OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short)
2695OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned)
2696OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int)
2697OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float)
2698OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64)
2699OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64)
2700#if CV_SIMD128_64F
2701OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double)
2702#endif
2703
2705
2706static const unsigned char popCountTable[] =
2707{
2708 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
2709 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2710 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2711 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2712 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2713 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2714 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2715 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2716 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2717 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2718 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2719 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2720 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2721 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2722 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2723 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
2724};
2725
2726#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
2727inline _rTpvec v_popcount(const _Tpvec& a) \
2728{ \
2729 uchar ptra[16] = {0}; \
2730 v_store(ptra, v_reinterpret_as_u8(a)); \
2731 _rTp ptr[_Tpvec::nlanes] = {0}; \
2732 v_store(ptr, v_setzero_##suffix()); \
2733 for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
2734 ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
2735 return v_load(ptr); \
2736}
2737
2738OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8)
2739OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8)
2740OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16)
2741OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16)
2742OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32)
2743OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32)
2744OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64)
2745OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
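// Sketch of v_popcount (illustrative only): each lane of the result holds the
// number of set bits in the corresponding input lane; the scalar loop above sums
// per-byte counts looked up in popCountTable.
//
//   v_uint8x16 v = v_setall_u8(0x0F);
//   v_uint8x16 c = v_popcount(v);   // every lane == 4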
2746
2747
2748
2749#ifndef __clang__
2750#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, vl, shift) \
2751inline int v_signmask(const _Tpvec& a) \
2752{ \
2753 int mask = 0; \
2754 _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift, vl)); \
2755 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
2756 mask |= (int)(tmp.val[i]) << i; \
2757 return mask; \
2758}
2759
2760OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 16, 7)
2761OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 8, 15)
2762OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 4, 31)
2763OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 2, 63)
2764
2765inline int v_signmask(const v_int8x16& a)
2766{ return v_signmask(v_reinterpret_as_u8(a)); }
2767inline int v_signmask(const v_int16x8& a)
2768{ return v_signmask(v_reinterpret_as_u16(a)); }
2769inline int v_signmask(const v_int32x4& a)
2770{ return v_signmask(v_reinterpret_as_u32(a)); }
2771inline int v_signmask(const v_float32x4& a)
2772{ return v_signmask(v_reinterpret_as_u32(a)); }
2773inline int v_signmask(const v_int64x2& a)
2774{ return v_signmask(v_reinterpret_as_u64(a)); }
2775#if CV_SIMD128_64F
2776inline int v_signmask(const v_float64x2& a)
2777{ return v_signmask(v_reinterpret_as_u64(a)); }
2778#endif
2779
2780#else
2781#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, width, vl) \
2782inline int v_signmask(const _Tpvec& a) \
2783{ \
2784 uint8_t ans[16] = {0};\
2785 vsm(ans, vmslt(a, 0, vl), vl);\
2786 return reinterpret_cast<int*>(ans)[0] & ((1 << (vl)) - 1);\
2787}
2788
2789OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8x16, 8, 16)
2790OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16x8, 16, 8)
2791OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32x4, 32, 4)
2792OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64x2, 64, 2)
2793
2794inline int v_signmask(const v_uint8x16& a)
2795{ return v_signmask(v_reinterpret_as_s8(a)); }
2796inline int v_signmask(const v_uint16x8& a)
2797{ return v_signmask(v_reinterpret_as_s16(a)); }
2798inline int v_signmask(const v_uint32x4& a)
2799{ return v_signmask(v_reinterpret_as_s32(a)); }
2800inline int v_signmask(const v_float32x4& a)
2801{ return v_signmask(v_reinterpret_as_s32(a)); }
2802inline int v_signmask(const v_uint64x2& a)
2803{ return v_signmask(v_reinterpret_as_s64(a)); }
2804#if CV_SIMD128_64F
2805inline int v_signmask(const v_float64x2& a)
2806{ return v_signmask(v_reinterpret_as_s64(a)); }
2807#endif
2808
2809#endif
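// Usage sketch for v_signmask (illustrative only): bit i of the result is set
// when lane i of the input is negative, mirroring SSE movemask behaviour.
//
//   int buf[4] = {-1, 2, -3, 4};
//   int m = v_signmask(v_load(buf));   // bits 0 and 2 set -> 0b0101 == 5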
2810
2812
2813#define OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(_Tpvec, _Tp, suffix) \
2814inline int v_scan_forward(const _Tpvec& a) \
2815{ \
2816 _Tp ptr[_Tpvec::nlanes] = {0}; \
2817 v_store(ptr, v_reinterpret_as_##suffix(a)); \
2818 for (int i = 0; i < _Tpvec::nlanes; i++) \
2819 if(int(ptr[i]) < 0) \
2820 return i; \
2821 return 0; \
2822}
2823
2824OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_uint8x16, uchar, u8)
2825OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_int8x16, schar, s8)
2826OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_uint16x8, ushort, u16)
2827OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_int16x8, short, s16)
2828OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_uint32x4, unsigned, u32)
2829OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_int32x4, int, s32)
2830OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_float32x4, float, f32)
2831OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_uint64x2, uint64, u64)
2832OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_int64x2, int64, s64)
2833#if CV_SIMD128_64F
2834OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_float64x2, double, f64)
2835#endif
2836
2838
2839inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2840{
2841 const uint64 ptr[2] = {0x0908060504020100, 0xFFFFFF0F0E0D0C0A};
2842 const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
2843 return v_reinterpret_as_s8(v_uint8x16(
2844 vrgather_vv_u8m1(
2845 v_reinterpret_as_u8(vec),
2846 v_reinterpret_as_u8(flags),
2847 16)));
2848}
2849inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
2850{
2851 return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
2852}
2853
2854inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2855{
2856 const uint64 ptr[2] = {0x0908050403020100, 0xFFFF0F0E0D0C0B0A};
2857 const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
2858 return v_reinterpret_as_s16(v_uint8x16(
2859 vrgather_vv_u8m1(
2860 v_reinterpret_as_u8(vec),
2861 v_reinterpret_as_u8(flags),
2862 16)));
2863}
2864inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
2865{
2866 return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
2867}
2868
2869inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
2870inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
2871inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2872
2874
2875#if CV_FP16
2876inline v_float32x4 v_load_expand(const hfloat* ptr)
2877{
2878 return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4));
2879}
2880
2881inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2882{
2883 vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
2884}
2885#else
2886inline v_float32x4 v_load_expand(const hfloat* ptr)
2887{
2888 const int N = 4;
2889 float buf[N];
2890 for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2891 return v_load(buf);
2892}
2893
2894inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
2895{
2896 const int N = 4;
2897 float buf[N];
2898 v_store(buf, v);
2899 for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
2900}
2901#endif
2902
2904
2905inline v_int32x4 v_round(const v_float32x4& a)
2906{
2907 return v_int32x4(vfcvt_x_f_v_i32m1(a, 4));
2908}
2909
2910inline v_int32x4 v_floor(const v_float32x4& a)
2911{
2912 v_float32x4 ZP5 = v_setall_f32(0.5f);
2913 v_float32x4 t = a - ZP5;
2914 return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
2915}
2916
2917inline v_int32x4 v_ceil(const v_float32x4& a)
2918{
2919 v_float32x4 ZP5 = v_setall_f32(0.5f);
2920 v_float32x4 t = a + ZP5;
2921 return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
2922}
2923
2924inline v_int32x4 v_trunc(const v_float32x4& a)
2925{
2926#ifndef CV_RVV_THEAD_0_7
2927 return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4));
2928#else
2929 const int old_round = fegetround(); fesetround(FE_TOWARDZERO);
2930 vint32m1_t val = vfcvt_x_f_v_i32m1(a, 4);
2931 fesetround(old_round);
2932 return v_int32x4(val);
2933#endif
2934}
2935#if CV_SIMD128_64F
2936#ifndef __clang__
2937inline v_int32x4 v_round(const v_float64x2& a)
2938{
2939 double arr[4] = {a.val[0], a.val[1], 0, 0};
2940 vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2941 return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
2942}
2943
2944inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2945{
2946 double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
2947 vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2948 return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
2949}
2950
2951inline v_int32x4 v_floor(const v_float64x2& a)
2952{
2953 double arr[4] = {a.val[0]-0.5, a.val[1]-0.5, 0, 0};
2954 vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2955 return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
2956}
2957
2958inline v_int32x4 v_ceil(const v_float64x2& a)
2959{
2960 double arr[4] = {a.val[0]+0.5, a.val[1]+0.5, 0, 0};
2961 vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2962 return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
2963}
2964
2965inline v_int32x4 v_trunc(const v_float64x2& a)
2966{
2967 double arr[4] = {a.val[0], a.val[1], 0, 0};
2968 vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
2969#ifndef CV_RVV_THEAD_0_7
2970 return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4));
2971#else
2972 const int old_round = fegetround(); fesetround(FE_TOWARDZERO);
2973 vint32m1_t val = vfncvt_x_f_w_i32m1(tmp, 4);
2974 fesetround(old_round);
2975 return v_int32x4(val);
2976#endif
2977}
2978
2979#else
2980inline v_int32x4 v_round(const v_float64x2& a)
2981{
2982 vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
2983 return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
2984}
2985
2986inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2987{
2988 vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
2989 return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(dst, 1, b), 4));
2990}
2991
2992inline v_int32x4 v_floor(const v_float64x2& a)
2993{
2994 vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
2995 dst = vset_v_f64m1_f64m2(dst, 0, a);
2996 dst = vfsub_vf_f64m2(dst, 0.5, 2);
2997 return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
2998}
2999
3000inline v_int32x4 v_ceil(const v_float64x2& a)
3001{
3002 vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
3003 dst = vset_v_f64m1_f64m2(dst, 0, a);
3004 dst = vfadd_vf_f64m2(dst, 0.5, 2);
3005 return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
3006}
3007
3008inline v_int32x4 v_trunc(const v_float64x2& a)
3009{
3010 vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
3011 return v_int32x4(vfncvt_rtz_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
3012}
3013#endif
3014#endif
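// Rounding sketch (illustrative only, based on the definitions above): v_round
// converts with the nearest rounding mode, v_floor/v_ceil bias by 0.5 before
// converting, and v_trunc converts toward zero.
//
//   v_float32x4 v = v_setall_f32(1.7f);
//   v_int32x4 r = v_round(v);   // 2
//   v_int32x4 f = v_floor(v);   // 1
//   v_int32x4 c = v_ceil(v);    // 2
//   v_int32x4 t = v_trunc(v);   // 1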
3015
3016
3018
3019// 16 >> 32
3020inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
3021{
3022 int ptr[8] = {0};
3023 v_int32x4 t1, t2;
3024 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
3025 v_load_deinterleave(ptr, t1, t2);
3026 return t1 + t2;
3027}
3028inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
3029{
3030 int ptr[8] = {0};
3031 v_int32x4 t1, t2;
3032 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
3033 v_load_deinterleave(ptr, t1, t2);
3034 return t1 + t2 + c;
3035}
3036
3037// 32 >> 64
3038inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
3039{
3040 int64 ptr[4] = {0};
3041 v_int64x2 t1, t2;
3042 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
3043 v_load_deinterleave(ptr, t1, t2);
3044 return t1 + t2;
3045}
3046inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
3047{
3048 int64 ptr[4] = {0};
3049 v_int64x2 t1, t2;
3050 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
3051 v_load_deinterleave(ptr, t1, t2);
3052 return t1 + t2 + c;
3053}
3054
3055// 8 >> 32
3056inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
3057{
3058 unsigned ptr[16] = {0};
3059 v_uint32x4 t1, t2, t3, t4;
3060 vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
3061 v_load_deinterleave(ptr, t1, t2, t3, t4);
3062 return t1 + t2 + t3 + t4;
3063}
3064inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
3065 const v_uint32x4& c)
3066{
3067 unsigned ptr[16] = {0};
3068 v_uint32x4 t1, t2, t3, t4;
3069 vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
3070 v_load_deinterleave(ptr, t1, t2, t3, t4);
3071 return t1 + t2 + t3 + t4 + c;
3072}
3073
3074inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
3075{
3076 int ptr[16] = {0};
3077 v_int32x4 t1, t2, t3, t4;
3078 vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
3079 v_load_deinterleave(ptr, t1, t2, t3, t4);
3080 return t1 + t2 + t3 + t4;
3081}
3082inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
3083 const v_int32x4& c)
3084{
3085 int ptr[16] = {0};
3086 v_int32x4 t1, t2, t3, t4;
3087 vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
3088 v_load_deinterleave(ptr, t1, t2, t3, t4);
3089 return t1 + t2 + t3 + t4 + c;
3090}
3091
3092// 16 >> 64
3093inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
3094{
3095 uint64 ptr[8] = {0};
3096 v_uint64x2 t1, t2, t3, t4;
3097 vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
3098 v_load_deinterleave(ptr, t1, t2, t3, t4);
3099 return t1 + t2 + t3 + t4;
3100}
3101inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
3102{
3103 uint64 ptr[8] = {0};
3104 v_uint64x2 t1, t2, t3, t4;
3105 vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
3106 v_load_deinterleave(ptr, t1, t2, t3, t4);
3107 return t1 + t2 + t3 + t4 + c;
3108}
3109
3110inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
3111{
3112 int64 ptr[8] = {0};
3113 v_int64x2 t1, t2, t3, t4;
3114 vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
3115 v_load_deinterleave(ptr, t1, t2, t3, t4);
3116 return t1 + t2 + t3 + t4;
3117}
3118inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
3119 const v_int64x2& c)
3120{
3121 int64 ptr[8] = {0};
3122 v_int64x2 t1, t2, t3, t4;
3123 vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
3124 v_load_deinterleave(ptr, t1, t2, t3, t4);
3125 return t1 + t2 + t3 + t4 + c;
3126}
3127
3128// 32 >> 64f
3129#if CV_SIMD128_64F
3130inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
3131{ return v_cvt_f64(v_dotprod(a, b)); }
3132inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
3133 const v_float64x2& c)
3134{ return v_dotprod_expand(a, b) + c; }
3135#endif
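// Dot-product sketch (illustrative only): v_dotprod multiplies adjacent lane
// pairs at double width and adds each pair, so an 8-lane int16 input yields a
// 4-lane int32 result.
//
//   short buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//   v_int16x8 v = v_load(buf);
//   v_int32x4 d = v_dotprod(v, v);   // {1*1+2*2, 3*3+4*4, 5*5+6*6, 7*7+8*8}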
3136
3138
3139// 16 >> 32
3140inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
3141{
3142 int ptr[8] = {0};
3143 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
3144 v_int32x4 t1 = v_load(ptr);
3145 v_int32x4 t2 = v_load(ptr+4);
3146 return t1 + t2;
3147}
3148inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
3149{
3150 int ptr[8] = {0};
3151 vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
3152 v_int32x4 t1 = v_load(ptr);
3153 v_int32x4 t2 = v_load(ptr+4);
3154 return t1 + t2 + c;
3155}
3156
3157// 32 >> 64
3158inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
3159{
3160 int64 ptr[4] = {0};
3161 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
3162 v_int64x2 t1 = v_load(ptr);
3163 v_int64x2 t2 = v_load(ptr+2);
3164 return t1 + t2;
3165}
3166inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
3167{
3168 int64 ptr[4] = {0};
3169 vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
3170 v_int64x2 t1 = v_load(ptr);
3171 v_int64x2 t2 = v_load(ptr+2);
3172 return t1 + t2 + c;
3173}
3174
3175
3176// 8 >> 32
3177inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
3178{
3179 unsigned ptr[16] = {0};
3180 vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
3181 v_uint32x4 t1 = v_load(ptr);
3182 v_uint32x4 t2 = v_load(ptr+4);
3183 v_uint32x4 t3 = v_load(ptr+8);
3184 v_uint32x4 t4 = v_load(ptr+12);
3185 return t1 + t2 + t3 + t4;
3186}
3187inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
3188{
3189 unsigned ptr[16] = {0};
3190 vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
3191 v_uint32x4 t1 = v_load(ptr);
3192 v_uint32x4 t2 = v_load(ptr+4);
3193 v_uint32x4 t3 = v_load(ptr+8);
3194 v_uint32x4 t4 = v_load(ptr+12);
3195 return t1 + t2 + t3 + t4 + c;
3196}
3197inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
3198{
3199 int ptr[16] = {0};
3200 vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
3201 v_int32x4 t1 = v_load(ptr);
3202 v_int32x4 t2 = v_load(ptr+4);
3203 v_int32x4 t3 = v_load(ptr+8);
3204 v_int32x4 t4 = v_load(ptr+12);
3205 return t1 + t2 + t3 + t4;
3206}
3207inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
3208{
3209 int ptr[16] = {0};
3210 vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
3211 v_int32x4 t1 = v_load(ptr);
3212 v_int32x4 t2 = v_load(ptr+4);
3213 v_int32x4 t3 = v_load(ptr+8);
3214 v_int32x4 t4 = v_load(ptr+12);
3215 return t1 + t2 + t3 + t4 + c;
3216}
3217
3218// 16 >> 64
3219inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
3220{
3221 uint64 ptr[8] = {0};
3222 vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
3223 v_uint64x2 t1 = v_load(ptr);
3224 v_uint64x2 t2 = v_load(ptr+2);
3225 v_uint64x2 t3 = v_load(ptr+4);
3226 v_uint64x2 t4 = v_load(ptr+6);
3227 return t1 + t2 + t3 + t4;
3228}
3229inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
3230{
3231 uint64 ptr[8] = {0};
3232 vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
3233 v_uint64x2 t1 = v_load(ptr);
3234 v_uint64x2 t2 = v_load(ptr+2);
3235 v_uint64x2 t3 = v_load(ptr+4);
3236 v_uint64x2 t4 = v_load(ptr+6);
3237 return t1 + t2 + t3 + t4 + c;
3238}
3239inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
3240{
3241 int64 ptr[8] = {0};
3242 vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
3243 v_int64x2 t1 = v_load(ptr);
3244 v_int64x2 t2 = v_load(ptr+2);
3245 v_int64x2 t3 = v_load(ptr+4);
3246 v_int64x2 t4 = v_load(ptr+6);
3247 return t1 + t2 + t3 + t4;
3248}
3249inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
3250{
3251 int64 ptr[8] = {0};
3252 vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
3253 v_int64x2 t1 = v_load(ptr);
3254 v_int64x2 t2 = v_load(ptr+2);
3255 v_int64x2 t3 = v_load(ptr+4);
3256 v_int64x2 t4 = v_load(ptr+6);
3257 return t1 + t2 + t3 + t4 + c;
3258}
3259
3260// 32 >> 64f
3261#if CV_SIMD128_64F
3262inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
3263{ return v_cvt_f64(v_dotprod_fast(a, b)); }
3264inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
3265{ return v_dotprod_expand_fast(a, b) + c; }
3266#endif
3267
3268
3269inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
3270 const v_float32x4& m1, const v_float32x4& m2,
3271 const v_float32x4& m3)
3272{
3273 vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
3274 res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
3275 res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
3276 res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3, 4);
3277 return v_float32x4(res);
3278}
3279
3280inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
3281 const v_float32x4& m1, const v_float32x4& m2,
3282 const v_float32x4& a)
3283{
3284 vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
3285 res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
3286 res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
3287 return v_float32x4(res) + a;
3288}
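// Sketch of the matrix helpers (illustrative only): v_matmul treats m0..m3 as
// the columns of a 4x4 matrix and returns m0*v0 + m1*v1 + m2*v2 + m3*v3;
// v_matmuladd does the same with three columns plus an additive vector.
//
//   v_float32x4 v  = v_setall_f32(1.0f);
//   v_float32x4 m0 = v_setall_f32(1.0f), m1 = m0, m2 = m0, m3 = m0;
//   v_float32x4 r  = v_matmul(v, m0, m1, m2, m3);   // every lane == 4.0f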
3289
3290#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) \
3291inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
3292{ \
3293 _Tpw ptr[_Tpwvec::nlanes*2] = {0}; \
3294 vse##width##_v_##suffix##m2(ptr, wmul(a, b, vl), vl); \
3295 c = _Tpwvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
3296 d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes, hvl)); \
3297}
3298
3299OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16, 16, 8)
3300OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16, 16, 8)
3301OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32, 8, 4)
3302OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32, 8, 4)
3303OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64, 4, 2)
3304
3305
3306inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
3307{
3308 return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b, 8), 16, 8));
3309}
3310inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
3311{
3312 return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b, 8), 16, 8));
3313}
3314
3315
3317
3318#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
3319inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
3320{ \
3321 _wTpvec c, d; \
3322 v_mul_expand(a, b, c, d); \
3323 return v_pack(c, d); \
3324} \
3325inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
3326{ \
3327 a = a * b; \
3328 return a; \
3329}
3330
3331OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
3332OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
3333OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
3334OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)
3335
3336
3337inline void v_cleanup() {}
3338
3339CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3340
3342
3343} // namespace cv
3344
3345#endif
Convert to float.
Definition intrin_cpp.hpp:2534
bool v_check_all(const v_reg< _Tp, n > &a)
Check if all packed values are less than zero.
Definition intrin_cpp.hpp:1421
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut_pairs(const _Tp *tab, const int *idx)
Definition intrin_cpp.hpp:2633
v_reg< float, n > v_matmuladd(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication and add.
Definition intrin_cpp.hpp:3223
v_reg< float, n > v_not_nan(const v_reg< float, n > &a)
Less-than comparison.
Definition intrin_cpp.hpp:890
void v_store_aligned(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (aligned)
Definition intrin_cpp.hpp:2251
v_reg< short, 8 > v_int16x8
Eight 16-bit signed integer values.
Definition intrin_cpp.hpp:495
v_reg< double, 2 > v_float64x2
Two 64-bit floating point values (double precision)
Definition intrin_cpp.hpp:503
void v_load_deinterleave(const _Tp *ptr, v_reg< _Tp, n > &a, v_reg< _Tp, n > &b)
Load and deinterleave (2 channels)
Definition intrin_cpp.hpp:2043
OutputArray dst
Definition imgproc.hpp:3564
"black box" representation of the file storage associated with a file on disk.
Definition calib3d.hpp:441