EstervQrCode 1.1.1
Library for QR code manipulation
reduce.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_CUDA_REDUCE_DETAIL_HPP
#define OPENCV_CUDA_REDUCE_DETAIL_HPP

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace reduce_detail
    {
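        // GetType strips pointer, volatile and reference qualifiers from the
        // shared-memory pointer / value types, yielding the bare element type.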
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };

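        // For<I, N>: compile-time loop over tuple elements, so several values
        // can be reduced in one pass. Each method handles element I and
        // recurses on I + 1; the For<N, N> specialization ends the recursion.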
        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(val);

                For<I + 1, N>::loadToSmem(smem, val, tid);
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(val) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, val, tid);
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::merge(smem, val, tid, delta, op);
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::mergeShfl(val, delta, width, op);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
        };

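        // Scalar overloads: one value per thread, staged through volatile
        // shared memory or, in mergeShfl, exchanged directly between the
        // registers of a warp via shfl_down.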
        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
        {
            smem[tid] = val;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
        {
            val = smem[tid];
        }

        template <typename T, class Op>
        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
        {
            T reg = smem[tid + delta];
            smem[tid] = val = op(val, reg);
        }

        template <typename T, class Op>
        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
        {
            T reg = shfl_down(val, delta, width);
            val = op(val, reg);
        }

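        // Tuple overloads. Thrust bundled with CUDA before 12.4 fixes the
        // tuple arity at ten types, while newer Thrust provides variadic
        // tuples, hence the two equivalent sets of signatures.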
#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                   const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
        }

        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                     const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
        }

        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                              unsigned int tid,
                                              unsigned int delta,
                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
        }
        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                  unsigned int delta,
                                                  unsigned int width,
                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
        }
#else
        template <typename... P, typename... R>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::loadToSmem(smem, val, tid);
        }

        template <typename... P, typename... R>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::loadFromSmem(smem, val, tid);
        }

        template <typename... P, typename... R, class... Op>
        __device__ __forceinline__ void merge(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid, unsigned int delta, const thrust::tuple<Op...>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::merge(smem, val, tid, delta, op);
        }

        template <typename... R, class... Op>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R...>& val, unsigned int delta, unsigned int width, const thrust::tuple<Op...>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<R...> >::value>::mergeShfl(val, delta, width, op);
        }
#endif
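        // Generic: straightforward block-wide tree reduction in shared
        // memory. Each step halves the number of active threads with a
        // barrier in between; the final steps run within a single warp
        // without __syncthreads(), relying on warp-synchronous execution
        // over volatile shared memory.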
        template <unsigned int N> struct Generic
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                loadToSmem(smem, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(smem, val, tid, 1024, op);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(smem, val, tid, 512, op);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(smem, val, tid, 256, op);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(smem, val, tid, 128, op);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(smem, val, tid, 64, op);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(smem, val, tid, 32, op);
                }

                if (tid < 16)
                {
                    merge(smem, val, tid, 16, op);
                    merge(smem, val, tid, 8, op);
                    merge(smem, val, tid, 4, op);
                    merge(smem, val, tid, 2, op);
                    merge(smem, val, tid, 1, op);
                }
            }
        };

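        // Unroll: fully unrolled tail of a reduction, merging at distances
        // I, I / 2, ..., 1; the Unroll<0, ...> specialization stops it.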
        template <unsigned int I, typename Pointer, typename Reference, class Op>
        struct Unroll
        {
            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
            {
                mergeShfl(val, I, N, op);
                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            }
            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                merge(smem, val, tid, I, op);
                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            }
        };
        template <typename Pointer, typename Reference, class Op>
        struct Unroll<0, Pointer, Reference, Op>
        {
            static __device__ void loopShfl(Reference, Op, unsigned int)
            {
            }
            static __device__ void loop(Pointer, Reference, unsigned int, Op)
            {
            }
        };

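        // WarpOptimized: reduction that fits in a single warp (N <= 32). On
        // sm_30+ it runs entirely in registers via warp shuffles; older
        // architectures fall back to the shared-memory path.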
        template <unsigned int N> struct WarpOptimized
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
                CV_UNUSED(smem);
                CV_UNUSED(tid);

                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            #else
                loadToSmem(smem, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            #endif
            }
        };

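        // GenericOptimized32: two-stage reduction for N a power of two up to
        // 1024. Every warp first reduces its own 32 values, lane 0 stores the
        // partial result at smem[tid / 32], and the first warp then reduces
        // the M = N / 32 partial results.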
        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                const unsigned int laneId = Warp::laneId();

            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #else
                loadToSmem(smem, val, tid);

                if (laneId < 16)
                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

                __syncthreads();

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #endif

                __syncthreads();

                loadFromSmem(smem, val, tid);

                if (tid < 32)
                {
                #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
                #else
                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
                #endif
                }
            }
        };

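        // StaticIf: compile-time type selection, equivalent to C++11
        // std::conditional.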
        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

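        // N is a power of two exactly when N & (N - 1) clears to zero.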
        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

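        // Dispatcher selects the reduction strategy for a block size N known
        // at compile time: warp shuffles when N fits in a warp, the two-stage
        // warp reduction for powers of two up to 1024, and the generic
        // shared-memory tree otherwise.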
        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}
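
// Illustrative usage sketch, not part of the original header. In OpenCV the
// companion header opencv2/core/cuda/reduce.hpp wraps this machinery in a
// convenience function cv::cuda::device::reduce<N>(); calling the detail
// dispatcher directly from a kernel might look like the following (the
// explicit template arguments force by-reference passing of val, and
// thrust::plus requires <thrust/functional.h>):
//
//     #define BLOCK_SIZE 256
//
//     __global__ void blockSum(const int* data, int* out, unsigned int n)
//     {
//         __shared__ int smem[BLOCK_SIZE];
//         const unsigned int tid = threadIdx.x;
//         const unsigned int idx = blockIdx.x * BLOCK_SIZE + tid;
//         int val = (idx < n) ? data[idx] : 0;   // identity element for +
//
//         cv::cuda::device::reduce_detail::Dispatcher<BLOCK_SIZE>::reductor
//             ::reduce<volatile int*, int&, const thrust::plus<int>&>(
//                 smem, val, tid, thrust::plus<int>());
//
//         if (tid == 0)
//             out[blockIdx.x] = val;             // this block's partial sum
//     }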

#endif // OPENCV_CUDA_REDUCE_DETAIL_HPP