EstervQrCode 2.0.0
Library for QR code manipulation
simd_functions.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 *   Neither the name of NVIDIA Corporation nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
#define OPENCV_CUDA_SIMD_FUNCTIONS_HPP

#include "common.hpp"

namespace cv { namespace cuda { namespace device
{
    // 2-lane operations (two 16-bit halfwords per 32-bit unsigned int)

    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
        r = r - s;          // subtract out carry-out from low word
    #endif

        return r;
    }
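
    // Illustrative trace of the emulation path above (added commentary, not part of the
    // upstream header). For a = 0x0001ffff, b = 0x00000001:
    //   s = a ^ b          = 0x0001fffe
    //   r = a + b          = 0x00020000   (carry out of the low halfword spills into bit 16)
    //   s = s ^ r          = 0x0003fffe   (carry-ins at every bit position)
    //   s = s & 0x00010000 = 0x00010000   (keep only the carry into the high halfword)
    //   r = r - s          = 0x00010000   -> halfwords {0x0001, 0x0000}
    // i.e. each halfword is added independently, wrapping modulo 2^16, while the
    // inline-PTX paths use the saturating (.sat) form of the instruction.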

    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // difference bits
        r = a - b;          // actual difference
        s = s ^ r;          // determine borrow-ins for each bit position
        s = s & 0x00010000; // borrow into high word (= borrow-out from low word)
        r = r + s;          // compensate for borrow from low word
    #endif

        return r;
    }
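
    // Illustrative trace of the emulation path above (added commentary, not part of the
    // upstream header). For a = 0x00020000, b = 0x00000001:
    //   r = a - b          = 0x0001ffff
    //   s = (a ^ b) ^ r    = 0x0003fffe   (borrow-ins at every bit position)
    //   s = s & 0x00010000 = 0x00010000   (borrow taken from the high halfword)
    //   r = r + s          = 0x0002ffff   -> halfwords {0x0002, 0xffff}
    // i.e. 0x0002 - 0x0000 = 0x0002 and 0x0000 - 0x0001 wraps to 0xffff, so each
    // halfword is subtracted independently of its neighbour.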

    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }
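
    // Worked example of the HAKMEM #23 identity used above (added commentary, not part
    // of the upstream header). For a = 0x00050003, b = 0x00030001:
    //   s = a ^ b                 = 0x00060002
    //   r = a & b                 = 0x00010001
    //   s = (s & 0xfffefffe) >> 1 = 0x00030001
    //   s = r + s                 = 0x00040002  -> halfword averages {4, 2}, rounded down.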

    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }
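
    // Worked example (added commentary, not part of the upstream header): unlike vavg2,
    // this variant rounds the average up. For a = 0x00010003, b = 0x00020004:
    //   s = a ^ b                 = 0x00030007
    //   r = a | b                 = 0x00030007
    //   s = (s & 0xfffefffe) >> 1 = 0x00010003
    //   r = r - s                 = 0x00020004  -> {(1+2+1)/2, (3+4+1)/2} = {2, 4}.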

    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vseteq2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }
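
    // Added commentary (not part of the upstream header): the vsetXX2 functions return a
    // per-halfword boolean (0x0000 or 0x0001), while the vcmpXX2 functions return a
    // per-halfword mask (0x0000 or 0xffff). For a = 0x12340000, b = 0x12340001:
    //   vseteq2(a, b) = 0x00010000
    //   vcmpeq2(a, b) = 0xffff0000
    // The mask form allows branch-free selection, e.g. with hypothetical values x, key, other:
    //   unsigned int m   = vcmpeq2(x, key);
    //   unsigned int sel = (x & m) | (other & ~m);  // pick x where x == key, else other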

    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
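
    // Added commentary (not part of the upstream header): a >= b exactly when a + ~b + 1
    // (that is, a - b in 16-bit arithmetic) produces a carry out of the halfword.
    // vavrg2(a, ~b) computes (a + ~b + 1) >> 1 per halfword, so that carry lands in each
    // halfword's msb, which is then masked out and shifted down to a boolean.
    // Example: a = 0x00050002, b = 0x00030003 -> vsetge2(a, b) = 0x00010000
    // (5 >= 3 holds in the high halfword, 2 >= 3 fails in the low halfword).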

    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }
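
    // Added commentary (not part of the upstream header): the strict comparison uses the
    // rounded-down average instead, since a > b exactly when a + ~b (that is, a - b - 1)
    // produces a carry out of the halfword; vavg2(a, ~b) = (a + ~b) >> 1 places that carry
    // in each halfword's msb. The le/lt variants below apply the same two tricks with the
    // operands swapped (by complementing a instead of b).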

    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }
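
    // Added usage sketch (not part of the upstream header): the halfword min/max pair can
    // be combined into a branch-free per-halfword clamp, assuming lo and hi are packed the
    // same way as the value being clamped:
    //   unsigned int clamped = vmin2(vmax2(x, lo), hi);  // clamp both halfwords of x to [lo, hi]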

    // 4-lane operations (four 8-bit bytes per 32-bit unsigned int)

    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
    #endif /* __CUDA_ARCH__ >= 300 */

        return r;
    }
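
    // Illustrative trace of the emulation path above (added commentary, not part of the
    // upstream header). For a = 0x80ff0001, b = 0x80010101:
    //   s = (a ^ b) & 0x80808080                 = 0x00800000  (sum of each byte's msb, mod 2)
    //   r = (a & 0x7f7f7f7f) + (b & 0x7f7f7f7f)  = 0x00800102  (low 7 bits added; each byte's
    //                                                           msb now holds the carry into it)
    //   r = r ^ s                                = 0x00000102  -> bytes {0x00, 0x00, 0x01, 0x02}
    // so every byte is added modulo 256 without disturbing its neighbours.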

    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ ~b;         // inverted sum bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb sum bits
        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
        r = r ^ s;          // combine inverted msb sum bits and borrows
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }
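
    // Worked example of the byte-wise HAKMEM #23 average (added commentary, not part of
    // the upstream header). For a = 0xff010400, b = 0x01010200:
    //   s = ((a ^ b) & 0xfefefefe) >> 1 = 0x7f000300
    //   r = (a & b) + s                 = 0x80010300
    // i.e. byte averages {0x80, 0x01, 0x03, 0x00}, each rounded down.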

    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int c;
        c = a ^ b;
        r = a | b;
        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        c = c >> 1;
        r = r - c;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }
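
    // Added commentary (not part of the upstream header): for a = 0x12345678 and
    // b = 0x12335678 only byte 2 differs, so vseteq4(a, b) = 0x01000101 (one boolean per
    // byte). Since every equal byte contributes exactly one set bit, the number of
    // matching bytes can be counted as __popc(vseteq4(a, b)).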

    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r, t;

    #if __CUDA_ARCH__ >= 300
        r = vseteq4(a, b);
        t = r << 8;         // convert bool
        r = t - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        t = a ^ b;          // 0x00 if a == b
        r = t | 0x80808080; // set msbs, to catch carry out
        t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
        r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
        r = t & ~r;         // msb = 1, if t was 0x00
        t = r >> 7;         // build mask
        t = r - t;          // from
        r = t | r;          // msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

    #if __CUDA_ARCH__ >= 300
        r = vsetge4(a, b);
        s = r << 8;         // convert bool
        r = s - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        r = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        r = r & 0x80808080; // msb = carry-outs
        s = r >> 7;         // build mask
        s = r - s;          // from
        r = s | r;          // msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a ^ b;          // difference bits, used by both selections below
        s = (r & s) ^ b;    // select a when a >= b, else select b => max(a,b)
        r = s ^ r;          // select a when b >= a, else select b => min(a,b)
        r = s - r;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }
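
    // Illustrative trace of the selection trick above (added commentary, not part of the
    // upstream header). For a = 0x09010003, b = 0x04050003:
    //   s = vcmpge4(a, b) = 0xff00ffff    (a >= b in bytes 3, 1 and 0)
    //   r = a ^ b         = 0x0d040000
    //   s = (r & s) ^ b   = 0x09050003    (per-byte max)
    //   r = s ^ r         = 0x04010003    (per-byte min)
    //   r = s - r         = 0x05040000    -> |a - b| per byte = {5, 4, 0, 0}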

    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a & s;          // select a when a >= b
        s = b & ~s;         // select b when a < b
        r = r | s;          // combine byte selections
    #endif

        return r;           // byte-wise unsigned maximum
    }

    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(b, a);  // mask = 0xff if b >= a
        r = a & s;          // select a when b >= a
        s = b & ~s;         // select b when b < a
        r = r | s;          // combine byte selections
    #endif

        return r;           // byte-wise unsigned minimum
    }
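
    // Added usage sketch (not part of the upstream header): a common pattern is a 4-byte
    // sum of absolute differences, combining vabsdiff4 with a horizontal byte sum, e.g.
    // for hypothetical packed pixels p and q:
    //   unsigned int d   = vabsdiff4(p, q);                  // |p - q| per byte
    //   unsigned int sad = (d & 0xff) + ((d >> 8) & 0xff)
    //                    + ((d >> 16) & 0xff) + (d >> 24);   // horizontal byte sum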
}}}

#endif // OPENCV_CUDA_SIMD_FUNCTIONS_HPP