43 #ifndef OPENCV_CUDA_TRANSFORM_DETAIL_HPP
44 #define OPENCV_CUDA_TRANSFORM_DETAIL_HPP
46 #include "../common.hpp"
47 #include "../vec_traits.hpp"
48 #include "../functional.hpp"
52 namespace cv {
namespace cuda {
namespace device
54 namespace transform_detail
58 template <
typename T,
typename D,
int shift>
struct UnaryReadWriteTraits
60 typedef typename TypeVec<T, shift>::vec_type read_type;
61 typedef typename TypeVec<D, shift>::vec_type write_type;
64 template <
typename T1,
typename T2,
typename D,
int shift>
struct BinaryReadWriteTraits
66 typedef typename TypeVec<T1, shift>::vec_type read_type1;
67 typedef typename TypeVec<T2, shift>::vec_type read_type2;
68 typedef typename TypeVec<D, shift>::vec_type write_type;
73 template <
int shift>
struct OpUnroller;
74 template <>
struct OpUnroller<1>
76 template <
typename T,
typename D,
typename UnOp,
typename Mask>
77 static __device__ __forceinline__
void unroll(
const T& src, D&
dst,
const Mask&
mask, UnOp& op,
int x_shifted,
int y)
79 if (
mask(
y, x_shifted))
83 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
84 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask, BinOp& op,
int x_shifted,
int y)
86 if (
mask(
y, x_shifted))
90 template <>
struct OpUnroller<2>
92 template <
typename T,
typename D,
typename UnOp,
typename Mask>
93 static __device__ __forceinline__
void unroll(
const T& src, D&
dst,
const Mask&
mask, UnOp& op,
int x_shifted,
int y)
95 if (
mask(
y, x_shifted))
97 if (
mask(
y, x_shifted + 1))
101 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
102 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask, BinOp& op,
int x_shifted,
int y)
104 if (
mask(
y, x_shifted))
106 if (
mask(
y, x_shifted + 1))
110 template <>
struct OpUnroller<3>
112 template <
typename T,
typename D,
typename UnOp,
typename Mask>
113 static __device__ __forceinline__
void unroll(
const T& src, D&
dst,
const Mask&
mask,
const UnOp& op,
int x_shifted,
int y)
115 if (
mask(
y, x_shifted))
117 if (
mask(
y, x_shifted + 1))
119 if (
mask(
y, x_shifted + 2))
123 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
124 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask,
const BinOp& op,
int x_shifted,
int y)
126 if (
mask(
y, x_shifted))
128 if (
mask(
y, x_shifted + 1))
130 if (
mask(
y, x_shifted + 2))
134 template <>
struct OpUnroller<4>
136 template <
typename T,
typename D,
typename UnOp,
typename Mask>
137 static __device__ __forceinline__
void unroll(
const T& src, D&
dst,
const Mask&
mask,
const UnOp& op,
int x_shifted,
int y)
139 if (
mask(
y, x_shifted))
141 if (
mask(
y, x_shifted + 1))
143 if (
mask(
y, x_shifted + 2))
145 if (
mask(
y, x_shifted + 3))
149 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
150 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask,
const BinOp& op,
int x_shifted,
int y)
152 if (
mask(
y, x_shifted))
154 if (
mask(
y, x_shifted + 1))
156 if (
mask(
y, x_shifted + 2))
158 if (
mask(
y, x_shifted + 3))
162 template <>
struct OpUnroller<8>
164 template <
typename T,
typename D,
typename UnOp,
typename Mask>
165 static __device__ __forceinline__
void unroll(
const T& src, D&
dst,
const Mask&
mask,
const UnOp& op,
int x_shifted,
int y)
167 if (
mask(
y, x_shifted))
169 if (
mask(
y, x_shifted + 1))
171 if (
mask(
y, x_shifted + 2))
173 if (
mask(
y, x_shifted + 3))
175 if (
mask(
y, x_shifted + 4))
177 if (
mask(
y, x_shifted + 5))
179 if (
mask(
y, x_shifted + 6))
181 if (
mask(
y, x_shifted + 7))
185 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
186 static __device__ __forceinline__
void unroll(
const T1&
src1,
const T2&
src2, D&
dst,
const Mask&
mask,
const BinOp& op,
int x_shifted,
int y)
188 if (
mask(
y, x_shifted))
190 if (
mask(
y, x_shifted + 1))
192 if (
mask(
y, x_shifted + 2))
194 if (
mask(
y, x_shifted + 3))
196 if (
mask(
y, x_shifted + 4))
198 if (
mask(
y, x_shifted + 5))
200 if (
mask(
y, x_shifted + 6))
202 if (
mask(
y, x_shifted + 7))
207 template <
typename T,
typename D,
typename UnOp,
typename Mask>
208 static __global__
void transformSmart(
const PtrStepSz<T> src_, PtrStep<D> dst_,
const Mask
mask,
const UnOp op)
210 typedef TransformFunctorTraits<UnOp> ft;
211 typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
212 typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
214 const int x = threadIdx.x + blockIdx.x * blockDim.x;
215 const int y = threadIdx.y + blockIdx.y * blockDim.y;
216 const int x_shifted =
x * ft::smart_shift;
220 const T* src = src_.ptr(
y);
221 D*
dst = dst_.ptr(
y);
223 if (x_shifted + ft::smart_shift - 1 < src_.cols)
225 const read_type src_n_el = ((
const read_type*)src)[
x];
226 OpUnroller<ft::smart_shift>::unroll(src_n_el, ((write_type*)
dst)[
x],
mask, op, x_shifted,
y);
230 for (
int real_x = x_shifted; real_x < src_.cols; ++real_x)
233 dst[real_x] = op(src[real_x]);
239 template <
typename T,
typename D,
typename UnOp,
typename Mask>
240 __global__
static void transformSimple(
const PtrStepSz<T> src, PtrStep<D>
dst,
const Mask
mask,
const UnOp op)
242 const int x = blockDim.x * blockIdx.x + threadIdx.x;
243 const int y = blockDim.y * blockIdx.y + threadIdx.y;
245 if (
x < src.cols &&
y < src.rows &&
mask(
y,
x))
247 dst.ptr(
y)[
x] = op(src.ptr(
y)[
x]);
251 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
252 static __global__
void transformSmart(
const PtrStepSz<T1> src1_,
const PtrStep<T2> src2_, PtrStep<D> dst_,
253 const Mask
mask,
const BinOp op)
255 typedef TransformFunctorTraits<BinOp> ft;
256 typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
257 typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
258 typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
260 const int x = threadIdx.x + blockIdx.x * blockDim.x;
261 const int y = threadIdx.y + blockIdx.y * blockDim.y;
262 const int x_shifted =
x * ft::smart_shift;
266 const T1*
src1 = src1_.ptr(
y);
267 const T2*
src2 = src2_.ptr(
y);
268 D*
dst = dst_.ptr(
y);
270 if (x_shifted + ft::smart_shift - 1 < src1_.cols)
272 const read_type1 src1_n_el = ((
const read_type1*)
src1)[
x];
273 const read_type2 src2_n_el = ((
const read_type2*)
src2)[
x];
275 OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, ((write_type*)
dst)[
x],
mask, op, x_shifted,
y);
279 for (
int real_x = x_shifted; real_x < src1_.cols; ++real_x)
288 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
289 static __global__
void transformSimple(
const PtrStepSz<T1>
src1,
const PtrStep<T2>
src2, PtrStep<D>
dst,
290 const Mask
mask,
const BinOp op)
292 const int x = blockDim.x * blockIdx.x + threadIdx.x;
293 const int y = blockDim.y * blockIdx.y + threadIdx.y;
297 const T1 src1_data =
src1.ptr(
y)[
x];
298 const T2 src2_data =
src2.ptr(
y)[
x];
299 dst.ptr(
y)[
x] = op(src1_data, src2_data);
303 template <
bool UseSmart>
struct TransformDispatcher;
304 template<>
struct TransformDispatcher<false>
306 template <
typename T,
typename D,
typename UnOp,
typename Mask>
307 static void call(PtrStepSz<T> src, PtrStepSz<D>
dst, UnOp op, Mask
mask, cudaStream_t stream)
309 typedef TransformFunctorTraits<UnOp> ft;
311 const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
312 const dim3 grid(
divUp(src.cols, threads.x),
divUp(src.rows, threads.y), 1);
314 transformSimple<T, D><<<grid, threads, 0, stream>>>(src,
dst,
mask, op);
315 cudaSafeCall( cudaGetLastError() );
318 cudaSafeCall( cudaDeviceSynchronize() );
321 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
322 static void call(PtrStepSz<T1>
src1, PtrStepSz<T2>
src2, PtrStepSz<D>
dst, BinOp op, Mask
mask, cudaStream_t stream)
324 typedef TransformFunctorTraits<BinOp> ft;
326 const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
329 transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(
src1,
src2,
dst,
mask, op);
330 cudaSafeCall( cudaGetLastError() );
333 cudaSafeCall( cudaDeviceSynchronize() );
336 template<>
struct TransformDispatcher<true>
338 template <
typename T,
typename D,
typename UnOp,
typename Mask>
339 static void call(PtrStepSz<T> src, PtrStepSz<D>
dst, UnOp op, Mask
mask, cudaStream_t stream)
341 typedef TransformFunctorTraits<UnOp> ft;
343 CV_StaticAssert(ft::smart_shift != 1,
"");
345 if (!
isAligned(src.data, ft::smart_shift *
sizeof(
T)) || !
isAligned(src.step, ft::smart_shift *
sizeof(
T)) ||
348 TransformDispatcher<false>::call(src,
dst, op,
mask, stream);
352 const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
353 const dim3 grid(
divUp(src.cols, threads.x * ft::smart_shift),
divUp(src.rows, threads.y), 1);
355 transformSmart<T, D><<<grid, threads, 0, stream>>>(src,
dst,
mask, op);
356 cudaSafeCall( cudaGetLastError() );
359 cudaSafeCall( cudaDeviceSynchronize() );
362 template <
typename T1,
typename T2,
typename D,
typename BinOp,
typename Mask>
363 static void call(PtrStepSz<T1>
src1, PtrStepSz<T2>
src2, PtrStepSz<D>
dst, BinOp op, Mask
mask, cudaStream_t stream)
365 typedef TransformFunctorTraits<BinOp> ft;
367 CV_StaticAssert(ft::smart_shift != 1,
"");
377 const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
378 const dim3 grid(
divUp(
src1.cols, threads.x * ft::smart_shift),
divUp(
src1.rows, threads.y), 1);
380 transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(
src1,
src2,
dst,
mask, op);
381 cudaSafeCall( cudaGetLastError() );
384 cudaSafeCall( cudaDeviceSynchronize() );
InputArrayOfArrays InputArrayOfArrays InputOutputArray InputOutputArray InputOutputArray InputOutputArray Size InputOutputArray InputOutputArray T
Definition: calib3d.hpp:1867
const CvArr const CvArr * src2
Definition: core_c.h:994
const CvArr * src1
Definition: core_c.h:993
const CvArr CvArr * x
Definition: core_c.h:1195
const CvArr * y
Definition: core_c.h:1187
static int divUp(int a, unsigned int b)
Integer division with result round up.
Definition: utility.hpp:482
static bool isAligned(const T &data)
Alignment check of passed values.
Definition: utility.hpp:517
CV_EXPORTS OutputArray int double double InputArray mask
Definition: imgproc.hpp:2132
OutputArray dst
Definition: imgproc.hpp:3564
"black box" representation of the file storage associated with a file on disk.
Definition: calib3d.hpp:441