43 #ifndef OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
44 #define OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
46 #include <thrust/tuple.h>
47 #include "../warp.hpp"
48 #include "../warp_shuffle.hpp"
52 namespace cv {
namespace cuda {
namespace device
54 namespace reduce_key_val_detail
// Type-extraction trait: strips pointer/reference (and volatile) qualification
// so the merge routines below can declare a plain register of the element type
// (e.g. GetType<volatile T*>::type == T). Primary template is left undefined;
// only the three supported forms are specialized.
template <typename T> struct GetType;

template <typename T> struct GetType<T*>
{
    typedef T type;
};
template <typename T> struct GetType<volatile T*>
{
    typedef T type;
};
template <typename T> struct GetType<T&>
{
    typedef T type;
};
// Compile-time loop over the elements of thrust tuples: each static member
// applies the matching scalar operation to tuple element I, then recurses on
// For<I + 1, N>. Recursion terminates at the For<N, N> specialization below.
template <unsigned int I, unsigned int N>
struct For
{
    // Store tuple element I of `data` into slot `tid` of its shared-memory array.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
    {
        thrust::get<I>(smem)[tid] = thrust::get<I>(data);

        For<I + 1, N>::loadToSmem(smem, data, tid);
    }
    // Load tuple element I of `data` back from slot `tid` of its shared-memory array.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
    {
        thrust::get<I>(data) = thrust::get<I>(smem)[tid];

        For<I + 1, N>::loadFromSmem(smem, data, tid);
    }

    // Replace each value element with the copy held by the lane `delta` below
    // (warp-shuffle path; `width` is the logical warp width).
    template <class ReferenceTuple>
    static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
    {
        thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);

        For<I + 1, N>::copyShfl(val, delta, width);
    }
    // Pull each value element from slot `tid + delta` into both the shared
    // array slot `tid` and the register copy (shared-memory path).
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
    {
        thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];

        For<I + 1, N>::copy(svals, val, tid, delta);
    }

    // Key/value merge via warp shuffle: fetch the partner lane's key; when the
    // comparator prefers it, adopt that key and shuffle in the matching value.
    template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val,
                                     const CmpTuple& cmp,
                                     unsigned int delta, int width)
    {
        typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg =
            shfl_down(thrust::get<I>(key), delta, width);

        if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
        {
            thrust::get<I>(key) = reg;
            thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
        }

        For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
    }
    // Key/value merge via shared memory: compare against the key in slot
    // `tid + delta`; on success update both shared arrays and register copies.
    template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                 const ValPointerTuple& svals, const ValReferenceTuple& val,
                                 const CmpTuple& cmp,
                                 unsigned int tid, unsigned int delta)
    {
        typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg =
            thrust::get<I>(skeys)[tid + delta];

        if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
        {
            thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
            thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
        }

        For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
    }
};
// Recursion terminator: index I has reached the tuple size N, so every
// operation is a no-op with unnamed parameters.
template <unsigned int N>
struct For<N, N>
{
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
    {
    }
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
    {
    }

    template <class ReferenceTuple>
    static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
    {
    }
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
    {
    }

    template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
    {
    }
    template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                 const ValPointerTuple&, const ValReferenceTuple&,
                                 const CmpTuple&,
                                 unsigned int, unsigned int)
    {
    }
};
// Stage a scalar into shared memory at slot `tid`.
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
{
    smem[tid] = data;
}
// Read a scalar back from shared-memory slot `tid` into the register copy.
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
{
    data = smem[tid];
}
// Tuple overloads of loadToSmem/loadFromSmem: iterate every tuple element via
// For<0, size>. Before CUDA 12.4 thrust::tuple is a fixed 10-slot template, so
// explicit 10-parameter overloads are required; newer Thrust (CCCL) is variadic.
#if (CUDART_VERSION < 12040)
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                           const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                           unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
}
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                             const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                             unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
}
#else
template <typename... VP, typename... VR>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP...>& smem,
                                           const thrust::tuple<VR...>& data,
                                           unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::loadToSmem(smem, data, tid);
}
template <typename... VP, typename... VR>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP...>& smem,
                                             const thrust::tuple<VR...>& data,
                                             unsigned int tid)
{
    For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::loadFromSmem(smem, data, tid);
}
#endif
// Shuffle the scalar value down by `delta` lanes so it follows its winning key
// (warp-shuffle path).
template <typename V>
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
{
    val = shfl_down(val, delta, width);
}
// Pull the value from shared slot `tid + delta` into slot `tid` and the
// register copy (shared-memory path).
template <typename V>
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
{
    svals[tid] = val = svals[tid + delta];
}
// Scalar key/value merge via warp shuffle: fetch the partner lane's key; when
// the comparator prefers it, adopt it and shuffle in the matching value.
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
{
    K reg = shfl_down(key, delta, width);

    if (cmp(reg, key))
    {
        key = reg;
        copyValsShfl(val, delta, width);
    }
}
// Scalar key/value merge via shared memory: compare against the key in slot
// `tid + delta`; on success update both shared arrays and register copies.
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val,
                                      const Cmp& cmp, unsigned int tid, unsigned int delta)
{
    K reg = skeys[tid + delta];

    if (cmp(reg, key))
    {
        skeys[tid] = key = reg;
        copyVals(svals, val, tid, delta);
    }
}
// Tuple overloads of copyValsShfl/copyVals/mergeShfl/merge. As above, CUDA
// < 12.4 needs explicit 10-parameter thrust::tuple overloads; newer Thrust
// (CCCL) allows variadic packs. Overloads with a scalar key and tuple values
// let a single key drive a multi-value reduction; fully-tuple overloads take
// one comparator per key element.
#if (CUDART_VERSION < 12040)
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                             unsigned int delta, int width)
{
    For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
}
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                         const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                         unsigned int tid, unsigned int delta)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
}

// Scalar key, tuple of values (shuffle path).
template <typename K,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp>
__device__ __forceinline__ void mergeShfl(K& key,
                                          const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                          const Cmp& cmp,
                                          unsigned int delta, int width)
{
    K reg = shfl_down(key, delta, width);

    if (cmp(reg, key))
    {
        key = reg;
        copyValsShfl(val, delta, width);
    }
}
// Scalar key, tuple of values (shared-memory path).
template <typename K,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                      const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                      const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                      const Cmp& cmp,
                                      unsigned int tid, unsigned int delta)
{
    K reg = skeys[tid + delta];

    if (cmp(reg, key))
    {
        skeys[tid] = key = reg;
        copyVals(svals, val, tid, delta);
    }
}

// Tuple of keys, tuple of values, tuple of comparators (shuffle path).
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                          const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                          const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                          unsigned int delta, int width)
{
    For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
}
// Tuple of keys, tuple of values, tuple of comparators (shared-memory path).
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
          typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                      const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                      const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                      const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                      const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                      unsigned int tid, unsigned int delta)
{
    For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
}
#else
template <typename... VR>
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR...>& val,
                                             unsigned int delta, int width)
{
    For<0, thrust::tuple_size<thrust::tuple<VR...> >::value>::copyShfl(val, delta, width);
}
template <typename... VP, typename... VR>
__device__ __forceinline__ void copyVals(const thrust::tuple<VP...>& svals,
                                         const thrust::tuple<VR...>& val,
                                         unsigned int tid, unsigned int delta)
{
    For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::copy(svals, val, tid, delta);
}

// Scalar key, tuple of values (shuffle path).
template <typename K, typename... VR, class Cmp>
__device__ __forceinline__ void mergeShfl(K& key,
                                          const thrust::tuple<VR...>& val,
                                          const Cmp& cmp,
                                          unsigned int delta, int width)
{
    K reg = shfl_down(key, delta, width);

    if (cmp(reg, key))
    {
        key = reg;
        copyValsShfl(val, delta, width);
    }
}
// Scalar key, tuple of values (shared-memory path).
template <typename K, typename... VP, typename... VR, class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                      const thrust::tuple<VP...>& svals,
                                      const thrust::tuple<VR...>& val,
                                      const Cmp& cmp,
                                      unsigned int tid, unsigned int delta)
{
    K reg = skeys[tid + delta];

    if (cmp(reg, key))
    {
        skeys[tid] = key = reg;
        copyVals(svals, val, tid, delta);
    }
}

// Tuple of keys, tuple of values, tuple of comparators (shuffle path).
template <typename... KR, typename... VR, class... Cmp>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR...>& key,
                                          const thrust::tuple<VR...>& val,
                                          const thrust::tuple<Cmp...>& cmp,
                                          unsigned int delta, int width)
{
    For<0, thrust::tuple_size<thrust::tuple<KR...> >::value>::mergeShfl(key, val, cmp, delta, width);
}
// Tuple of keys, tuple of values, tuple of comparators (shared-memory path).
template <typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
__device__ __forceinline__ void merge(const thrust::tuple<KP...>& skeys,
                                      const thrust::tuple<KR...>& key,
                                      const thrust::tuple<VP...>& svals,
                                      const thrust::tuple<VR...>& val,
                                      const thrust::tuple<Cmp...>& cmp,
                                      unsigned int tid, unsigned int delta)
{
    For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
}
#endif
// Block-wide key/value reduction through shared memory for an arbitrary block
// size N: classic halving cascade with __syncthreads() between rounds, then an
// unsynchronized warp-level tail for tid < 32.
template <unsigned int N> struct Generic
{
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        loadToSmem(skeys, key, tid);
        // NOTE(review): `loadValsToSmem` is not defined in this header — presumably
        // provided elsewhere; other reducers call loadToSmem here. Confirm.
        loadValsToSmem(svals, val, tid);
        if (N >= 32)
            __syncthreads();

        if (N >= 2048)
        {
            if (tid < 1024)
                merge(skeys, key, svals, val, cmp, tid, 1024);

            __syncthreads();
        }
        if (N >= 1024)
        {
            if (tid < 512)
                merge(skeys, key, svals, val, cmp, tid, 512);

            __syncthreads();
        }
        if (N >= 512)
        {
            if (tid < 256)
                merge(skeys, key, svals, val, cmp, tid, 256);

            __syncthreads();
        }
        if (N >= 256)
        {
            if (tid < 128)
                merge(skeys, key, svals, val, cmp, tid, 128);

            __syncthreads();
        }
        if (N >= 128)
        {
            if (tid < 64)
                merge(skeys, key, svals, val, cmp, tid, 64);

            __syncthreads();
        }

        // Final warp: lanes proceed without barriers (intra-warp reduction).
        if (tid < 32)
        {
            if (N >= 64)
                merge(skeys, key, svals, val, cmp, tid, 32);
            if (N >= 32)
                merge(skeys, key, svals, val, cmp, tid, 16);
            if (N >= 16)
                merge(skeys, key, svals, val, cmp, tid, 8);
            if (N >= 8)
                merge(skeys, key, svals, val, cmp, tid, 4);
            if (N >= 4)
                merge(skeys, key, svals, val, cmp, tid, 2);
            if (N >= 2)
                merge(skeys, key, svals, val, cmp, tid, 1);
        }
    }
};
// Compile-time unrolled merge cascade: performs one merge at stride I, then
// recurses at stride I/2 until the Unroll<0, ...> terminator below.
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
struct Unroll
{
    // Warp-shuffle variant; N is the logical warp width passed to shfl_down.
    static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
    {
        mergeShfl(key, val, cmp, I, N);
        Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
    }
    // Shared-memory variant.
    static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        merge(skeys, key, svals, val, cmp, tid, I);
        Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
    }
};
// Terminator: stride has reached zero, nothing left to merge.
template <class KP, class KR, class VP, class VR, class Cmp>
struct Unroll<0, KP, KR, VP, VR, Cmp>
{
    static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
    {
    }
    static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
    {
    }
};
// Reduction for N <= warp size: a single warp cooperates, so no __syncthreads()
// is needed. On SM30+ the whole reduction runs in registers via warp shuffles;
// otherwise it falls back to shared memory.
template <unsigned int N> struct WarpOptimized
{
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
    #if __CUDA_ARCH__ >= 300
        (void) skeys;
        (void) svals;
        (void) tid;

        Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
    #else
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);

        if (tid < N / 2)
            Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
    #endif
    }
};
// Two-stage reduction for power-of-two block sizes up to 1024: each warp first
// reduces internally (shuffles on SM30+, shared memory otherwise), lane 0 of
// every warp writes its partial into slots [0, N/32), and the first warp then
// reduces the M = N/32 partials.
template <unsigned int N> struct GenericOptimized32
{
    enum { M = N / 32 };

    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        const unsigned int laneId = Warp::laneId();

    #if __CUDA_ARCH__ >= 300
        // Intra-warp reduction in registers; lane 0 publishes the warp result.
        Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);

        if (laneId == 0)
        {
            loadToSmem(skeys, key, tid / 32);
            loadToSmem(svals, val, tid / 32);
        }
    #else
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);

        if (laneId < 16)
            Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);

        __syncthreads();

        if (laneId == 0)
        {
            loadToSmem(skeys, key, tid / 32);
            loadToSmem(svals, val, tid / 32);
        }
    #endif

        __syncthreads();

        loadFromSmem(skeys, key, tid);

        // First warp reduces the per-warp partials.
        if (tid < 32)
        {
        #if __CUDA_ARCH__ >= 300
            loadFromSmem(svals, val, tid);

            Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
        #else
            Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
        #endif
        }
    }
};
// Compile-time type selector: yields T1 when `val` is true, T2 otherwise
// (pre-C++11 equivalent of std::conditional).
template <bool val, class T1, class T2> struct StaticIf;
template <class T1, class T2> struct StaticIf<true, T1, T2>
{
    typedef T1 type;
};
template <class T1, class T2> struct StaticIf<false, T1, T2>
{
    typedef T2 type;
};
// Compile-time test: true when N is a non-zero power of two.
template <unsigned int N> struct IsPowerOf2
{
    enum { value = ((N != 0) && !(N & (N - 1))) };
};
// Selects the reduction strategy for block size N:
//   - power of two and <= 32   -> WarpOptimized (single warp, no barriers)
//   - power of two and <= 1024 -> GenericOptimized32 (per-warp then cross-warp)
//   - otherwise                -> Generic (barrier cascade, any N)
// NOTE(review): tail of this typedef was lost in extraction; reconstructed from
// the visible GenericOptimized32<N> alternative — confirm against upstream.
template <unsigned int N> struct Dispatcher
{
    typedef typename StaticIf<
        (N <= 32) && IsPowerOf2<N>::value,
        WarpOptimized<N>,
        typename StaticIf<
            (N <= 1024) && IsPowerOf2<N>::value,
            GenericOptimized32<N>,
            Generic<N>
        >::type
    >::type reductor;
};
InputArrayOfArrays InputArrayOfArrays InputOutputArray InputOutputArray InputOutputArray InputOutputArray Size InputOutputArray InputOutputArray T
Definition: calib3d.hpp:1867
CV_EXPORTS_W void reduce(InputArray src, OutputArray dst, int dim, int rtype, int dtype=-1)
Reduces a matrix to a vector.
CV_EXPORTS void merge(const Mat *mv, size_t count, OutputArray dst)
Creates one multi-channel array out of several single-channel ones.
int CvScalar value
Definition: core_c.h:720
int int type
Definition: core_c.h:221
void * data
Definition: core_c.h:427
const CvArr const CvArr * V
Definition: core_c.h:1341
CvArr CvPoint2D32f double M
Definition: imgproc_c.h:270
CvSize int int int CvPoint int delta
Definition: imgproc_c.h:1168
"black box" representation of the file storage associated with a file on disk.
Definition: calib3d.hpp:441