43 #ifndef OPENCV_CUDA_REDUCE_DETAIL_HPP
44 #define OPENCV_CUDA_REDUCE_DETAIL_HPP
46 #include <thrust/tuple.h>
47 #include "../warp.hpp"
48 #include "../warp_shuffle.hpp"
52 namespace cv {
namespace cuda {
namespace device
54 namespace reduce_detail
// Maps a pointer/reference type to its underlying value type.
// Used by For<>::merge / For<>::mergeShfl to declare a register
// temporary whose type matches a tuple element's element type.
// NOTE(review): the `typedef T type;` bodies were dropped by the text
// extraction; reconstructed from how `GetType<...>::type` is consumed below.
template <typename T> struct GetType;

template <typename T> struct GetType<T*>
{
    typedef T type;
};
template <typename T> struct GetType<volatile T*>
{
    typedef T type;
};
template <typename T> struct GetType<T&>
{
    typedef T type;
};
// Compile-time loop over tuple elements [I, N): applies the per-element
// operation via thrust::get<I> and recurses with I + 1.
// Terminated by the For<N, N> specialization.
template <unsigned int I, unsigned int N>
struct For
{
    // Store each value-tuple element into its shared-memory array at `tid`.
    template <class PointerTuple, class ValTuple>
    static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
    {
        thrust::get<I>(smem)[tid] = thrust::get<I>(val);

        For<I + 1, N>::loadToSmem(smem, val, tid);
    }

    // Load each value-tuple element back from shared memory at `tid`.
    template <class PointerTuple, class ValTuple>
    static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
    {
        thrust::get<I>(val) = thrust::get<I>(smem)[tid];

        For<I + 1, N>::loadFromSmem(smem, val, tid);
    }

    // Combine each element with its partner at `tid + delta` using the
    // element's reduction op, writing the result to both the register
    // value and shared memory.
    template <class PointerTuple, class ValTuple, class OpTuple>
    static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
    {
        typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];

        thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

        For<I + 1, N>::merge(smem, val, tid, delta, op);
    }

    // Same combine step, but the partner value comes from a warp shuffle
    // (shfl_down from ../warp_shuffle.hpp) instead of shared memory.
    template <class ValTuple, class OpTuple>
    static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
    {
        typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);

        thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

        For<I + 1, N>::mergeShfl(val, delta, width, op);
    }
};
// Recursion terminator: all tuple elements [0, N) have been processed,
// so every operation is a no-op.
template <unsigned int N>
struct For<N, N>
{
    template <class PointerTuple, class ValTuple>
    static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
    {
    }
    template <class PointerTuple, class ValTuple>
    static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
    {
    }

    template <class PointerTuple, class ValTuple, class OpTuple>
    static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
    {
    }
    template <class ValTuple, class OpTuple>
    static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
    {
    }
};
// Scalar overloads: move a single value between a register and its
// shared-memory slot. Bodies were dropped by the extraction; reconstructed
// to mirror the per-element tuple versions in For<> above.
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
{
    smem[tid] = val;
}
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
{
    val = smem[tid];
}
// Scalar combine step: fold the partner value at `tid + delta` into `val`
// and publish the result back to shared memory.
template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
{
    T reg = smem[tid + delta];
    smem[tid] = val = op(val, reg);
}
// Scalar combine step using a warp shuffle: fold lane (laneId + delta)'s
// value into `val`; no shared memory touched.
template <typename T, class Op>
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
{
    T reg = shfl_down(val, delta, width);
    val = op(val, reg);
}
// Tuple overloads: forward to the compile-time For<0, tuple_size> loop so a
// multi-value reduction handles every element of the smem/val/op tuples.
// Older CUDA Toolkits ship a thrust::tuple fixed at 10 slots, hence the
// explicit P0..P9/R0..R9/Op0..Op9 form; newer ones take a variadic tuple.
// NOTE(review): the #else/#endif of this conditional were lost in the
// extraction and have been restored.
#if (CUDART_VERSION < 12040)
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
          typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                           const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                           unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
          typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                             const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                             unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
}
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
          typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
          class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                      const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                      unsigned int tid,
                                      unsigned int delta,
                                      const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
    For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
}
template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
          class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                          unsigned int delta,
                                          unsigned int width,
                                          const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
    For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
}
#else
template <typename... P, typename... R>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P...>& smem,
                                           const thrust::tuple<R...>& val,
                                           unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::loadToSmem(smem, val, tid);
}
template <typename... P, typename... R>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P...>& smem,
                                             const thrust::tuple<R...>& val,
                                             unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::loadFromSmem(smem, val, tid);
}
template <typename... P, typename... R, class... Op>
__device__ __forceinline__ void merge(const thrust::tuple<P...>& smem,
                                      const thrust::tuple<R...>& val,
                                      unsigned int tid,
                                      unsigned int delta,
                                      const thrust::tuple<Op...>& op)
{
    For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::merge(smem, val, tid, delta, op);
}
template <typename... R, class... Op>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R...>& val,
                                          unsigned int delta,
                                          unsigned int width,
                                          const thrust::tuple<Op...>& op)
{
    For<0, thrust::tuple_size<thrust::tuple<R...> >::value>::mergeShfl(val, delta, width, op);
}
#endif
// General block-wide tree reduction for N threads (N = block size).
// Each halving step is guarded by the compile-time `N >= 2*delta` check and
// followed by a block barrier; the final 5 steps run inside a single warp.
// NOTE(review): the `if (N >= ...)` guards and `__syncthreads()` barriers were
// dropped by the extraction (only the merge-call skeleton survived) and have
// been restored. The tid < 16 tail relies on legacy implicit warp synchrony
// through volatile shared memory (pre-Volta idiom) — preserved as-is.
template <unsigned int N> struct Generic
{
    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        loadToSmem(smem, val, tid);
        if (N >= 32)
            __syncthreads();

        if (N >= 2048)
        {
            if (tid < 1024)
                merge(smem, val, tid, 1024, op);

            __syncthreads();
        }
        if (N >= 1024)
        {
            if (tid < 512)
                merge(smem, val, tid, 512, op);

            __syncthreads();
        }
        if (N >= 512)
        {
            if (tid < 256)
                merge(smem, val, tid, 256, op);

            __syncthreads();
        }
        if (N >= 256)
        {
            if (tid < 128)
                merge(smem, val, tid, 128, op);

            __syncthreads();
        }
        if (N >= 128)
        {
            if (tid < 64)
                merge(smem, val, tid, 64, op);

            __syncthreads();
        }
        if (N >= 64)
        {
            if (tid < 32)
                merge(smem, val, tid, 32, op);
        }

        if (tid < 16)
        {
            merge(smem, val, tid, 16, op);
            merge(smem, val, tid, 8, op);
            merge(smem, val, tid, 4, op);
            merge(smem, val, tid, 2, op);
            merge(smem, val, tid, 1, op);
        }
    }
};
// Compile-time unrolled halving loop: performs merge steps with
// delta = I, I/2, ..., 1. `loopShfl` uses warp shuffles; `loop` uses
// shared memory. Terminated by the Unroll<0, ...> specialization.
template <unsigned int I, typename Pointer, typename Reference, class Op>
struct Unroll
{
    static __device__ void loopShfl(Reference val, Op op, unsigned int N)
    {
        mergeShfl(val, I, N, op);
        Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
    }
    static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        merge(smem, val, tid, I, op);
        Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
    }
};
// Recursion terminator: delta reached 0 — nothing left to merge.
template <typename Pointer, typename Reference, class Op>
struct Unroll<0, Pointer, Reference, Op>
{
    static __device__ void loopShfl(Reference, Op, unsigned int)
    {
    }
    static __device__ void loop(Pointer, Reference, unsigned int, Op)
    {
    }
};
// Reduction for N <= warpSize threads (N a power of two). On SM30+ the whole
// reduction runs in registers via warp shuffles; otherwise it falls back to
// the shared-memory unrolled loop. NOTE(review): the #else/#endif of the
// __CUDA_ARCH__ conditional and the tid guard were dropped by the extraction
// and have been restored.
template <unsigned int N> struct WarpOptimized
{
    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
        (void) smem;   // unused on the shuffle path
        (void) tid;

        Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
    #else
        loadToSmem(smem, val, tid);

        if (tid < N / 2)
            Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
    #endif
    }
};
// Two-stage reduction for N > 32 (N a multiple of 32): each warp reduces its
// 32 values, lane 0 stores the warp partial at smem[tid / 32], then the first
// M = N/32 partials are reduced by the first warp.
// NOTE(review): `enum { M = N / 32 };` and the laneId/tid guards and barriers
// were dropped by the extraction (M is referenced but never defined in the
// scraped text) and have been restored.
template <unsigned int N> struct GenericOptimized32
{
    enum { M = N / 32 };

    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        const unsigned int laneId = Warp::laneId();

    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
        // Warp-local reduction in registers; lane 0 publishes the partial.
        Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

        if (laneId == 0)
            loadToSmem(smem, val, tid / 32);
    #else
        loadToSmem(smem, val, tid);

        if (laneId == 0)
            Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

        __syncthreads();

        if (laneId == 0)
            loadToSmem(smem, val, tid / 32);
    #endif

        __syncthreads();

        loadFromSmem(smem, val, tid);

        // Final reduction of the M warp partials by the first warp.
        if (tid < 32)
        {
        #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
            Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
        #else
            Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
        #endif
        }
    }
};
// Compile-time type selection: StaticIf<cond, T1, T2>::type is T1 when cond
// is true, T2 otherwise (pre-C++11 stand-in for std::conditional).
template <bool val, class T1, class T2> struct StaticIf;
template <class T1, class T2> struct StaticIf<true, T1, T2>
{
    typedef T1 type;
};
template <class T1, class T2> struct StaticIf<false, T1, T2>
{
    typedef T2 type;
};
// Compile-time test: value is 1 when N is a non-zero power of two
// (standard N & (N - 1) trick), 0 otherwise.
template <unsigned int N> struct IsPowerOf2
{
    enum { value = ((N != 0) && !(N & (N - 1))) };
};
// Selects the reduction strategy for a block of N threads:
//   - WarpOptimized      when N fits in one warp and is a power of two;
//   - GenericOptimized32 when N <= 1024 and is a power of two;
//   - Generic            otherwise.
// NOTE(review): everything from the middle of this selector to the end of the
// file was overwritten by documentation-index junk in the extracted text; the
// selector body, the namespace closers, and the include-guard #endif have
// been restored to match the surviving opening lines (L1-L9).
template <unsigned int N> struct Dispatcher
{
    typedef typename StaticIf<
        (N <= 32) && IsPowerOf2<N>::value,
        WarpOptimized<N>,
        typename StaticIf<
            (N <= 1024) && IsPowerOf2<N>::value,
            GenericOptimized32<N>,
            Generic<N>
        >::type
    >::type reductor;
};
}
}}}

#endif // OPENCV_CUDA_REDUCE_DETAIL_HPP