#ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
#define OPENCV_CUDA_SIMD_FUNCTIONS_HPP

#include "common.hpp"

namespace cv { namespace cuda { namespace device
{
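    /*
     * SIMD-in-a-word helpers. The "2" variants treat a 32-bit register as two
     * packed 16-bit halfwords, the "4" variants as four packed 8-bit bytes, and
     * all lanes are processed in parallel. On sm_30 and newer each function maps
     * to a single vector video instruction (vadd2, vmin4, ...); on sm_20 it maps
     * to per-lane scalar video instructions; on older targets it falls back to
     * portable bit-twiddling emulation.
     */
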
    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high halfword (= carry-out from low halfword)
        r = r - s;          // subtract out carry-out from low halfword
    #endif

        return r;
    }

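    // Example: the hardware paths saturate each halfword lane, so
    // vadd2(0x0001ffff, 0x00010001) yields 0x0002ffff on sm_20+, while the
    // wrapping fallback above yields 0x00020000 (the low lane overflows to 0).
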
    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // difference bits
        r = a - b;          // actual difference
        s = s ^ r;          // determine borrow-ins for each bit position
        s = s & 0x00010000; // borrow-in to high halfword (= borrow-out from low halfword)
        r = r + s;          // add back borrow-out from low halfword
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a, b) - min(a, b)
    #endif

        return r;
    }

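    // Worked example: vabsdiff2(0x00050002, 0x00020007) = 0x00030005, since
    // |0x0005 - 0x0002| = 0x0003 in the high lane and |0x0002 - 0x0007| = 0x0005
    // in the low lane.
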
    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vseteq2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        // Mycroft-style detection of all-zero halfwords, widened to a mask
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

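    // Convention: the vset* functions return a per-lane bool (0x0001 per 16-bit
    // lane, 0x01 per 8-bit lane, where the predicate holds), while the vcmp*
    // functions return a per-lane mask (0xffff / 0xff) that can be ANDed with
    // other packed values; the bool is widened into the mask via (r << 16) - r
    // for halfwords or (r << 8) - r for bytes.
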
    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        r = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        r = r & 0x80008000; // msb = carry-outs
        r = r >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        r = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        r = r & 0x80008000; // msb = carry-outs
        r = r >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        r = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        r = r & 0x80008000; // msb = carry-outs
        r = r >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        r = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        r = r & 0x80008000; // msb = carry-outs
        r = r >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        // Mycroft-style detection of nonzero halfwords, widened to a mask
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // handle msbs
    #endif

        return r;
    }

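    // Example: vadd4(0x01ff0203, 0x01020304) yields 0x02ff0507 on the saturating
    // hardware paths (lane b2 clamps at 0xff), but 0x02010507 with the wrapping
    // fallback above.
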
    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ ~b;         // inverted difference bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb difference bits
        r = r - t;          // subtract without msbs, record inverted borrows in msbs
        r = r ^ s;          // handle msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfefefefe; // ensure shift doesn't cross byte boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfefefefe; // ensure shift doesn't cross byte boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vseteq4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  into mask
    #else
        // Mycroft-style detection of all-zero bytes, widened to a mask
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        r = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        r = r & 0x80808080; // msb = carry-outs
        r = r >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        r = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        r = r & 0x80808080; // msb = carry-outs
        r = r >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        r = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        r = r & 0x80808080; // msb = carry-outs
        r = r >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetge4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        r = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        r = r & 0x80808080; // msb = carry-outs
        r = r >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  into mask
    #else
        // Mycroft-style detection of nonzero bytes, widened to a mask
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a ^ b;          // per-byte difference bits
        s = (r & s) ^ b;    // select a where a >= b, else b => max(a, b)
        r = s ^ r;          // select b where a >= b, else a => min(a, b)
        r = s - r;          // |a - b| = max(a, b) - min(a, b)
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a & s;          // select a where a >= b
        s = b & ~s;         // select b where b > a
        r = r | s;          // combine byte selections
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(b, a);  // mask = 0xff if b >= a
        r = a & s;          // select a where a <= b
        s = b & ~s;         // select b where b < a
        r = r | s;          // combine byte selections
    #endif

        return r;
    }

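    /*
     * Usage sketch: per-byte absolute difference of two packed-uchar buffers.
     * The kernel below is illustrative only; its name and launch configuration
     * are not part of this header, and it assumes the buffers are 4-byte
     * aligned so each thread handles four pixels per 32-bit load.
     *
     *   __global__ void absdiffKernel(const unsigned int* a, const unsigned int* b,
     *                                 unsigned int* dst, int n4)
     *   {
     *       int i = blockIdx.x * blockDim.x + threadIdx.x;
     *       if (i < n4)
     *           dst[i] = cv::cuda::device::vabsdiff4(a[i], b[i]); // 4 pixels at once
     *   }
     */
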
}}} // namespace cv { namespace cuda { namespace device

#endif // OPENCV_CUDA_SIMD_FUNCTIONS_HPP