43 #ifndef OPENCV_CUDA_DEVICE_BLOCK_HPP
44 #define OPENCV_CUDA_DEVICE_BLOCK_HPP
52 namespace cv {
namespace cuda {
namespace device
// Index of this block within the grid (x dimension only).
// NOTE(review): the body was dropped by the documentation extraction;
// restored from the upstream OpenCV implementation.
static __device__ __forceinline__ unsigned int id()
{
    return blockIdx.x;
}
// Total number of threads in the block (product of all three block dims).
static __device__ __forceinline__ unsigned int stride()
{
    return blockDim.x * blockDim.y * blockDim.z;
}
// Block-wide barrier; must be reached by every thread of the block.
// NOTE(review): the body was dropped by the documentation extraction;
// restored from the upstream OpenCV implementation.
static __device__ __forceinline__ void sync()
{
    __syncthreads();
}
// Linear (flattened) index of this thread within its block, z-major:
// tid = (z * dimY + y) * dimX + x. Ranges over [0, stride()).
static __device__ __forceinline__ int flattenedThreadId()
{
    return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
}
// Cooperatively fill [beg, end) with `value`: each thread writes elements
// flattenedThreadId(), +stride(), +2*stride(), ... (block-stride loop).
// All threads of the block must call this; no barrier is performed here.
// NOTE(review): the loop body (*t = value) was dropped by the documentation
// extraction; restored from the upstream OpenCV implementation.
template<typename It, typename T>
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
{
    int STRIDE = stride();
    It t = beg + flattenedThreadId();

    for(; t < end; t += STRIDE)
        *t = value;
}
// Cooperatively write the sequence value, value+1, value+2, ... into
// [beg, end) ("iota"; the upstream name `yota` is kept for compatibility).
// Each thread handles elements tid, tid+stride(), ... and offsets `value`
// by the same amount so the output stays a contiguous ascending sequence.
// NOTE(review): `value += tid;` and the loop body were dropped by the
// documentation extraction; restored from the upstream implementation.
template<typename OutIt, typename T>
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
{
    int STRIDE = stride();
    int tid = flattenedThreadId();
    value += tid;

    for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
        *t = value;
}
// Cooperatively copy [beg, end) to `out` using a block-stride loop.
// Input and output ranges must not overlap in a way that breaks the
// per-element copy order (elements are copied independently).
// NOTE(review): the loop body (*o = *t) was dropped by the documentation
// extraction; restored from the upstream OpenCV implementation.
template<typename InIt, typename OutIt>
static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
{
    int STRIDE = stride();
    InIt t = beg + flattenedThreadId();
    OutIt o = out + (t - beg);

    for(; t < end; t += STRIDE, o += STRIDE)
        *o = *t;
}
// Cooperatively apply the unary functor `op` to every element of
// [beg, end), writing results to `out` (block-stride loop).
// NOTE(review): the loop body (*o = op(*t)) was dropped by the
// documentation extraction; restored from the upstream implementation.
template<typename InIt, typename OutIt, class UnOp>
static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
{
    int STRIDE = stride();
    InIt t = beg + flattenedThreadId();
    OutIt o = out + (t - beg);

    for(; t < end; t += STRIDE, o += STRIDE)
        *o = op(*t);
}
// Cooperatively apply the binary functor `op` element-wise to
// [beg1, end1) and the equally-sized range starting at beg2, writing
// results to `out` (block-stride loop).
// NOTE(review): the loop body (*o = op(*t1, *t2)) was dropped by the
// documentation extraction; restored from the upstream implementation.
template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
{
    int STRIDE = stride();
    InIt1 t1 = beg1 + flattenedThreadId();
    InIt2 t2 = beg2 + flattenedThreadId();
    OutIt o = out + (t1 - beg1);

    for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
        *o = op(*t1, *t2);
}
// In-place tree reduction over a shared-memory buffer of CTA_SIZE elements
// (CTA_SIZE must be a power of two and equal the number of participating
// threads). On return, buffer[0] holds op-reduction of buffer[0..CTA_SIZE).
// All CTA_SIZE threads must call this together.
// NOTE(review): the extraction dropped the `val` initialization and the
// `tid < 32` guard around the warp-level tail; restored from upstream.
// NOTE(review): the unsynchronized tail relies on legacy implicit warp
// synchrony via `volatile` — pre-Volta idiom; on Volta+ independent thread
// scheduling this pattern needs __syncwarp()/cub instead. Kept as upstream.
template<int CTA_SIZE, typename T, class BinOp>
static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
{
    int tid = flattenedThreadId();
    T val = buffer[tid];

    // Tree halving with a block barrier while more than one warp is active.
    if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
    if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
    if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
    if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

    // Final steps run within a single warp; no block barrier used.
    if (tid < 32)
    {
        if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
        if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
        if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
        if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
        if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
        if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
    }
}
// Tree reduction that first stores each thread's `init` contribution into
// buffer[tid], reduces the CTA_SIZE-element buffer with `op`, and returns
// the final value buffer[0] to every thread. CTA_SIZE must be a power of
// two equal to the number of participating threads; all must call together.
// NOTE(review): the extraction dropped the `buffer[tid] = init` prologue,
// the `tid < 32` guard, and the `return buffer[0]` epilogue; restored from
// the upstream OpenCV implementation.
// NOTE(review): the warp-level tail relies on legacy implicit warp
// synchrony via `volatile` (pre-Volta idiom). Kept as upstream.
template<int CTA_SIZE, typename T, class BinOp>
static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
{
    int tid = flattenedThreadId();
    T val = buffer[tid] = init;
    __syncthreads();

    // Tree halving with a block barrier while more than one warp is active.
    if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
    if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
    if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
    if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

    // Final steps run within a single warp.
    if (tid < 32)
    {
        if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
        if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
        if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
        if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
        if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
        if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
    }
    __syncthreads();

    // Broadcast the result to every thread of the block.
    return buffer[0];
}
// Reduce `data[0..n)` in place with `op` for arbitrary n (no power-of-two
// requirement); on return data[0] holds the result. All stride() threads of
// the block must call this together. First folds the tail beyond stride()
// into the leading stride() elements, then halves the active range with a
// block barrier per step.
// NOTE(review): the body was truncated by the documentation extraction;
// restored from the upstream OpenCV implementation.
template <typename T, class BinOp>
static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
{
    int ftid = flattenedThreadId();
    int sft = stride();

    // Fold elements beyond the first `sft` into slot ftid.
    if (sft < n)
    {
        for (unsigned int i = sft + ftid; i < n; i += sft)
            data[ftid] = op(data[ftid], data[i]);

        __syncthreads();

        n = sft;
    }

    // Pairwise halving: combine data[ftid] with data[n - ftid - 1].
    while (n > 1)
    {
        unsigned int half = n / 2;

        if (ftid < half)
            data[ftid] = op(data[ftid], data[n - ftid - 1]);

        __syncthreads();

        n = n - half;
    }
}
// NOTE(review): stray Doxygen cross-reference/tooltip text (entries for
// unrelated OpenCV headers such as calib3d.hpp and core_c.h) was embedded
// here by the documentation extraction; it was never part of this header.