43 #ifndef OPENCV_CUDA_SCAN_HPP
44 #define OPENCV_CUDA_SCAN_HPP
46 #include "opencv2/core/cuda/common.hpp"
47 #include "opencv2/core/cuda/utility.hpp"
48 #include "opencv2/core/cuda/warp.hpp"
49 #include "opencv2/core/cuda/warp_shuffle.hpp"
57 namespace cv {
namespace cuda {
namespace device
59 enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
61 template <ScanKind Kind,
typename T,
typename F>
struct WarpScan
63 __device__ __forceinline__ WarpScan() {}
64 __device__ __forceinline__ WarpScan(
const WarpScan& other) { CV_UNUSED(other); }
66 __device__ __forceinline__
T operator()(
volatile T *ptr ,
const unsigned int idx)
68 const unsigned int lane =
idx & 31;
71 if ( lane >= 1) ptr [
idx ] = op(ptr [
idx - 1], ptr [
idx]);
72 if ( lane >= 2) ptr [
idx ] = op(ptr [
idx - 2], ptr [
idx]);
73 if ( lane >= 4) ptr [
idx ] = op(ptr [
idx - 4], ptr [
idx]);
74 if ( lane >= 8) ptr [
idx ] = op(ptr [
idx - 8], ptr [
idx]);
75 if ( lane >= 16) ptr [
idx ] = op(ptr [
idx - 16], ptr [
idx]);
77 if( Kind == INCLUSIVE )
80 return (lane > 0) ? ptr [
idx - 1] : 0;
83 __device__ __forceinline__
unsigned int index(
const unsigned int tid)
88 __device__ __forceinline__
void init(
volatile T *ptr){}
90 static const int warp_offset = 0;
92 typedef WarpScan<INCLUSIVE, T, F>
merge;
95 template <ScanKind Kind ,
typename T,
typename F>
struct WarpScanNoComp
97 __device__ __forceinline__ WarpScanNoComp() {}
98 __device__ __forceinline__ WarpScanNoComp(
const WarpScanNoComp& other) { CV_UNUSED(other); }
100 __device__ __forceinline__
T operator()(
volatile T *ptr ,
const unsigned int idx)
102 const unsigned int lane = threadIdx.x & 31;
105 ptr [
idx ] = op(ptr [
idx - 1], ptr [
idx]);
106 ptr [
idx ] = op(ptr [
idx - 2], ptr [
idx]);
107 ptr [
idx ] = op(ptr [
idx - 4], ptr [
idx]);
108 ptr [
idx ] = op(ptr [
idx - 8], ptr [
idx]);
109 ptr [
idx ] = op(ptr [
idx - 16], ptr [
idx]);
111 if( Kind == INCLUSIVE )
114 return (lane > 0) ? ptr [
idx - 1] : 0;
117 __device__ __forceinline__
unsigned int index(
const unsigned int tid)
119 return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
122 __device__ __forceinline__
void init(
volatile T *ptr)
124 ptr[threadIdx.x] = 0;
127 static const int warp_smem_stride = 32 + 16 + 1;
128 static const int warp_offset = 16;
129 static const int warp_log = 5;
130 static const int warp_mask = 31;
132 typedef WarpScanNoComp<INCLUSIVE, T, F>
merge;
135 template <ScanKind Kind ,
typename T,
typename Sc,
typename F>
struct BlockScan
137 __device__ __forceinline__ BlockScan() {}
138 __device__ __forceinline__ BlockScan(
const BlockScan& other) { CV_UNUSED(other); }
140 __device__ __forceinline__
T operator()(
volatile T *ptr)
142 const unsigned int tid = threadIdx.x;
143 const unsigned int lane = tid & warp_mask;
144 const unsigned int warp = tid >> warp_log;
148 const unsigned int idx = scan.index(tid);
150 T val = scan(ptr,
idx);
158 ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [
idx];
162 merge_scan(ptr,
idx);
166 val = ptr [scan.warp_offset + warp - 1] + val;
175 static const int warp_log = 5;
176 static const int warp_mask = 31;
179 template <
typename T>
180 __device__
T warpScanInclusive(
T idata,
volatile T* s_Data,
unsigned int tid)
182 #if __CUDA_ARCH__ >= 300
183 const unsigned int laneId = cv::cuda::device::Warp::laneId();
187 for (
int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
189 const T n = cv::cuda::device::shfl_up(idata, i);
196 unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));
198 pos += OPENCV_CUDA_WARP_SIZE;
201 s_Data[
pos] += s_Data[
pos - 1];
202 s_Data[
pos] += s_Data[
pos - 2];
203 s_Data[
pos] += s_Data[
pos - 4];
204 s_Data[
pos] += s_Data[
pos - 8];
205 s_Data[
pos] += s_Data[
pos - 16];
211 template <
typename T>
212 __device__ __forceinline__
T warpScanExclusive(
T idata,
volatile T* s_Data,
unsigned int tid)
214 return warpScanInclusive(idata, s_Data, tid) - idata;
217 template <
int tiNumScanThreads,
typename T>
218 __device__
T blockScanInclusive(
T idata,
volatile T* s_Data,
unsigned int tid)
220 if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
223 T warpResult = warpScanInclusive(idata, s_Data, tid);
228 if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))
230 s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;
236 if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) )
241 s_Data[tid] = warpScanExclusive(val, s_Data, tid);
247 return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
251 return warpScanInclusive(idata, s_Data, tid);
InputArrayOfArrays InputArrayOfArrays InputOutputArray InputOutputArray InputOutputArray InputOutputArray Size InputOutputArray InputOutputArray T
Definition: calib3d.hpp:1867
InputArrayOfArrays InputArrayOfArrays InputOutputArray InputOutputArray InputOutputArray InputOutputArray Size InputOutputArray InputOutputArray OutputArray OutputArray F
Definition: calib3d.hpp:1867
CV_EXPORTS void merge(const Mat *mv, size_t count, OutputArray dst)
Creates one multi-channel array out of several single-channel ones.
const int * idx
Definition: core_c.h:668
CvMemStoragePos * pos
Definition: core_c.h:1573
int index
Definition: core_c.h:634
"black box" representation of the file storage associated with a file on disk.
Definition: calib3d.hpp:441