EstervQrCode 1.1.1
Library for qr code manipulation
reduce_key_val.hpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #ifndef OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
44 #define OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
45 
46 #include <thrust/tuple.h>
47 #include "../warp.hpp"
48 #include "../warp_shuffle.hpp"
49 
51 
52 namespace cv { namespace cuda { namespace device
53 {
54  namespace reduce_key_val_detail
55  {
56  template <typename T> struct GetType;
57  template <typename T> struct GetType<T*>
58  {
59  typedef T type;
60  };
61  template <typename T> struct GetType<volatile T*>
62  {
63  typedef T type;
64  };
65  template <typename T> struct GetType<T&>
66  {
67  typedef T type;
68  };
69 
70  template <unsigned int I, unsigned int N>
71  struct For
72  {
73  template <class PointerTuple, class ReferenceTuple>
74  static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
75  {
76  thrust::get<I>(smem)[tid] = thrust::get<I>(data);
77 
78  For<I + 1, N>::loadToSmem(smem, data, tid);
79  }
80  template <class PointerTuple, class ReferenceTuple>
81  static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
82  {
83  thrust::get<I>(data) = thrust::get<I>(smem)[tid];
84 
85  For<I + 1, N>::loadFromSmem(smem, data, tid);
86  }
87 
88  template <class ReferenceTuple>
89  static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
90  {
91  thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
92 
93  For<I + 1, N>::copyShfl(val, delta, width);
94  }
95  template <class PointerTuple, class ReferenceTuple>
96  static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
97  {
98  thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
99 
100  For<I + 1, N>::copy(svals, val, tid, delta);
101  }
102 
103  template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
104  static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
105  {
106  typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
107 
108  if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
109  {
110  thrust::get<I>(key) = reg;
111  thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
112  }
113 
114  For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
115  }
116  template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
117  static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
118  const ValPointerTuple& svals, const ValReferenceTuple& val,
119  const CmpTuple& cmp,
120  unsigned int tid, unsigned int delta)
121  {
122  typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];
123 
124  if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
125  {
126  thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
127  thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
128  }
129 
130  For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
131  }
132  };
133  template <unsigned int N>
134  struct For<N, N>
135  {
136  template <class PointerTuple, class ReferenceTuple>
137  static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
138  {
139  }
140  template <class PointerTuple, class ReferenceTuple>
141  static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
142  {
143  }
144 
145  template <class ReferenceTuple>
146  static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
147  {
148  }
149  template <class PointerTuple, class ReferenceTuple>
150  static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
151  {
152  }
153 
154  template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
155  static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
156  {
157  }
158  template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
159  static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
160  const ValPointerTuple&, const ValReferenceTuple&,
161  const CmpTuple&,
162  unsigned int, unsigned int)
163  {
164  }
165  };
166 
168  // loadToSmem
169 
170  template <typename T>
171  __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
172  {
173  smem[tid] = data;
174  }
175  template <typename T>
176  __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
177  {
178  data = smem[tid];
179  }
180 
181 #if (CUDART_VERSION < 12040)
182  template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
183  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
184  __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
185  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
186  unsigned int tid)
187  {
188  For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
189  }
190  template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
191  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
192  __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
193  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
194  unsigned int tid)
195  {
196  For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
197  }
198 #else
199  template <typename... VP, typename... VR>
200  __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP...>& smem, const thrust::tuple<VR...>& data, unsigned int tid)
201  {
202  For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::loadToSmem(smem, data, tid);
203  }
204  template <typename... VP, typename... VR>
205  __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP...>& smem, const thrust::tuple<VR...>& data, unsigned int tid)
206  {
207  For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::loadFromSmem(smem, data, tid);
208  }
209 #endif
210 
211  template <typename V>
212  __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
213  {
214  val = shfl_down(val, delta, width);
215  }
216  template <typename V>
217  __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
218  {
219  svals[tid] = val = svals[tid + delta];
220  }
221 
222  template <typename K, typename V, class Cmp>
223  __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
224  {
225  K reg = shfl_down(key, delta, width);
226 
227  if (cmp(reg, key))
228  {
229  key = reg;
230  copyValsShfl(val, delta, width);
231  }
232  }
233  template <typename K, typename V, class Cmp>
234  __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
235  {
236  K reg = skeys[tid + delta];
237 
238  if (cmp(reg, key))
239  {
240  skeys[tid] = key = reg;
241  copyVals(svals, val, tid, delta);
242  }
243  }
244 
245 #if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690
246  template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
247  __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
248  unsigned int delta,
249  int width)
250  {
251  For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
252  }
253  template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
254  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
255  __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
256  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
257  unsigned int tid, unsigned int delta)
258  {
259  For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
260  }
261 
262  template <typename K,
263  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
264  class Cmp>
265  __device__ __forceinline__ void mergeShfl(K& key,
266  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
267  const Cmp& cmp,
268  unsigned int delta, int width)
269  {
270  K reg = shfl_down(key, delta, width);
271 
272  if (cmp(reg, key))
273  {
274  key = reg;
275  copyValsShfl(val, delta, width);
276  }
277  }
278  template <typename K,
279  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
280  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
281  class Cmp>
282  __device__ __forceinline__ void merge(volatile K* skeys, K& key,
283  const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
284  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
285  const Cmp& cmp, unsigned int tid, unsigned int delta)
286  {
287  K reg = skeys[tid + delta];
288 
289  if (cmp(reg, key))
290  {
291  skeys[tid] = key = reg;
292  copyVals(svals, val, tid, delta);
293  }
294  }
295  template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
296  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
297  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
298  __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
299  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
300  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
301  unsigned int delta, int width)
302  {
303  For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
304  }
305  template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
306  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
307  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
308  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
309  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
310  __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
311  const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
312  const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
313  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
314  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
315  unsigned int tid, unsigned int delta)
316  {
317  For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
318  }
319 #else
320  template <typename... VR>
321  __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR...>& val, unsigned int delta, int width)
322  {
323  For<0, thrust::tuple_size<thrust::tuple<VR...> >::value>::copyShfl(val, delta, width);
324  }
325  template <typename... VP, typename... VR>
326  __device__ __forceinline__ void copyVals(const thrust::tuple<VP...>& svals, const thrust::tuple<VR...>& val, unsigned int tid, unsigned int delta)
327  {
328  For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::copy(svals, val, tid, delta);
329  }
330 
331  template <typename K, typename... VR, class Cmp>
332  __device__ __forceinline__ void mergeShfl(K& key, const thrust::tuple<VR...>& val, const Cmp& cmp, unsigned int delta, int width)
333  {
334  K reg = shfl_down(key, delta, width);
335 
336  if (cmp(reg, key))
337  {
338  key = reg;
339  copyValsShfl(val, delta, width);
340  }
341  }
342  template <typename K, typename... VP, typename... VR, class Cmp>
343  __device__ __forceinline__ void merge(volatile K* skeys, K& key, const thrust::tuple<VP...>& svals,
344  const thrust::tuple<VR...>& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
345  {
346  K reg = skeys[tid + delta];
347 
348  if (cmp(reg, key))
349  {
350  skeys[tid] = key = reg;
351  copyVals(svals, val, tid, delta);
352  }
353  }
354  template <typename... KR, typename... VR, class... Cmp>
355  __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR...>& key,
356  const thrust::tuple<VR...>& val,
357  const thrust::tuple<Cmp...>& cmp,
358  unsigned int delta, int width)
359  {
360  For<0, thrust::tuple_size<thrust::tuple<KR...> >::value>::mergeShfl(key, val, cmp, delta, width);
361  }
362  template <typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
363  __device__ __forceinline__ void merge(const thrust::tuple<KP...>& skeys,
364  const thrust::tuple<KR...>& key,
365  const thrust::tuple<VP...>& svals,
366  const thrust::tuple<VR...>& val,
367  const thrust::tuple<Cmp...>& cmp,
368  unsigned int tid, unsigned int delta)
369  {
370  For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
371  }
372 
373 #endif
375  // Generic
376 
377  template <unsigned int N> struct Generic
378  {
379  template <class KP, class KR, class VP, class VR, class Cmp>
380  static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
381  {
382  loadToSmem(skeys, key, tid);
383  loadValsToSmem(svals, val, tid);
384  if (N >= 32)
385  __syncthreads();
386 
387  if (N >= 2048)
388  {
389  if (tid < 1024)
390  merge(skeys, key, svals, val, cmp, tid, 1024);
391 
392  __syncthreads();
393  }
394  if (N >= 1024)
395  {
396  if (tid < 512)
397  merge(skeys, key, svals, val, cmp, tid, 512);
398 
399  __syncthreads();
400  }
401  if (N >= 512)
402  {
403  if (tid < 256)
404  merge(skeys, key, svals, val, cmp, tid, 256);
405 
406  __syncthreads();
407  }
408  if (N >= 256)
409  {
410  if (tid < 128)
411  merge(skeys, key, svals, val, cmp, tid, 128);
412 
413  __syncthreads();
414  }
415  if (N >= 128)
416  {
417  if (tid < 64)
418  merge(skeys, key, svals, val, cmp, tid, 64);
419 
420  __syncthreads();
421  }
422  if (N >= 64)
423  {
424  if (tid < 32)
425  merge(skeys, key, svals, val, cmp, tid, 32);
426  }
427 
428  if (tid < 16)
429  {
430  merge(skeys, key, svals, val, cmp, tid, 16);
431  merge(skeys, key, svals, val, cmp, tid, 8);
432  merge(skeys, key, svals, val, cmp, tid, 4);
433  merge(skeys, key, svals, val, cmp, tid, 2);
434  merge(skeys, key, svals, val, cmp, tid, 1);
435  }
436  }
437  };
438 
439  template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
440  struct Unroll
441  {
442  static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
443  {
444  mergeShfl(key, val, cmp, I, N);
445  Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
446  }
447  static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
448  {
449  merge(skeys, key, svals, val, cmp, tid, I);
450  Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
451  }
452  };
453  template <class KP, class KR, class VP, class VR, class Cmp>
454  struct Unroll<0, KP, KR, VP, VR, Cmp>
455  {
456  static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
457  {
458  }
459  static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
460  {
461  }
462  };
463 
464  template <unsigned int N> struct WarpOptimized
465  {
466  template <class KP, class KR, class VP, class VR, class Cmp>
467  static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
468  {
469  #if 0 // __CUDA_ARCH__ >= 300
470  CV_UNUSED(skeys);
471  CV_UNUSED(svals);
472  CV_UNUSED(tid);
473 
474  Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
475  #else
476  loadToSmem(skeys, key, tid);
477  loadToSmem(svals, val, tid);
478 
479  if (tid < N / 2)
480  Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
481  #endif
482  }
483  };
484 
485  template <unsigned int N> struct GenericOptimized32
486  {
487  enum { M = N / 32 };
488 
489  template <class KP, class KR, class VP, class VR, class Cmp>
490  static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
491  {
492  const unsigned int laneId = Warp::laneId();
493 
494  #if 0 // __CUDA_ARCH__ >= 300
495  Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
496 
497  if (laneId == 0)
498  {
499  loadToSmem(skeys, key, tid / 32);
500  loadToSmem(svals, val, tid / 32);
501  }
502  #else
503  loadToSmem(skeys, key, tid);
504  loadToSmem(svals, val, tid);
505 
506  if (laneId < 16)
507  Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
508 
509  __syncthreads();
510 
511  if (laneId == 0)
512  {
513  loadToSmem(skeys, key, tid / 32);
514  loadToSmem(svals, val, tid / 32);
515  }
516  #endif
517 
518  __syncthreads();
519 
520  loadFromSmem(skeys, key, tid);
521 
522  if (tid < 32)
523  {
524  #if 0 // __CUDA_ARCH__ >= 300
525  loadFromSmem(svals, val, tid);
526 
527  Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
528  #else
529  Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
530  #endif
531  }
532  }
533  };
534 
535  template <bool val, class T1, class T2> struct StaticIf;
536  template <class T1, class T2> struct StaticIf<true, T1, T2>
537  {
538  typedef T1 type;
539  };
540  template <class T1, class T2> struct StaticIf<false, T1, T2>
541  {
542  typedef T2 type;
543  };
544 
545  template <unsigned int N> struct IsPowerOf2
546  {
547  enum { value = ((N != 0) && !(N & (N - 1))) };
548  };
549 
550  template <unsigned int N> struct Dispatcher
551  {
552  typedef typename StaticIf<
554  WarpOptimized<N>,
555  typename StaticIf<
557  GenericOptimized32<N>,
558  Generic<N>
559  >::type
560  >::type reductor;
561  };
562  }
563 }}}
564 
566 
567 #endif // OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
T copy(T... args)
InputArrayOfArrays InputArrayOfArrays InputOutputArray InputOutputArray InputOutputArray InputOutputArray Size InputOutputArray InputOutputArray T
Definition: calib3d.hpp:1867
CV_EXPORTS_W void reduce(InputArray src, OutputArray dst, int dim, int rtype, int dtype=-1)
Reduces a matrix to a vector.
CV_EXPORTS void merge(const Mat *mv, size_t count, OutputArray dst)
Creates one multi-channel array out of several single-channel ones.
int CvScalar value
Definition: core_c.h:720
int int type
Definition: core_c.h:221
void * data
Definition: core_c.h:427
const CvArr const CvArr * V
Definition: core_c.h:1341
CvArr CvPoint2D32f double M
Definition: imgproc_c.h:270
CvSize int int int CvPoint int delta
Definition: imgproc_c.h:1168
"black box" representation of the file storage associated with a file on disk.
Definition: calib3d.hpp:441