EstervQrCode 1.1.1
Library for QR code manipulation
simd_functions.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 *   Neither the name of NVIDIA Corporation nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
#define OPENCV_CUDA_SIMD_FUNCTIONS_HPP

#include "common.hpp"

namespace cv { namespace cuda { namespace device
{
    // 2 (operations on two 16-bit halfwords packed in a 32-bit word)

    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
        r = r - s;          // subtract out carry-out from low word
    #endif

        return r;
    }
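
    // Worked example of the emulation path (hypothetical values):
    //   a = 0x0001FFFF, b = 0x00010001          halfwords {0x0001,0xFFFF} plus {0x0001,0x0001}
    //   s = a ^ b                = 0x0000FFFE
    //   r = a + b                = 0x00030000   low-halfword carry leaked into the high halfword
    //   s = (s ^ r) & 0x00010000 = 0x00010000   that carry-out detected
    //   r = r - s                = 0x00020000   halfwords {0x0002,0x0000}: independent wrapping adds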

    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // difference bits
        r = a - b;          // actual difference
        s = s ^ r;          // determine borrow-ins for each bit position
        s = s & 0x00010000; // borrow into high word (= borrow-out from low word)
        r = r + s;          // compensate for borrow from low word
    #endif

        return r;
    }
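
    // Worked example of the emulation path (hypothetical values):
    //   a = 0x00030000, b = 0x00010001          halfwords {0x0003,0x0000} minus {0x0001,0x0001}
    //   s = a ^ b                = 0x00020001
    //   r = a - b                = 0x0001FFFF   low-halfword borrow taken from the high halfword
    //   s = (s ^ r) & 0x00010000 = 0x00010000   that borrow detected
    //   r = r + s                = 0x0002FFFF   halfwords {0x0002,0xFFFF}: independent wrapping subtracts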

    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a,b) - min(a,b)
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }
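
    // Worked example (hypothetical values): a = 0x00050003, b = 0x00030002
    //   s = a ^ b = 0x00060001,  r = a & b = 0x00010002
    //   s = (s & 0xfffefffe) >> 1 = 0x00030000
    //   r + s = 0x00040002  ->  per-halfword averages {4, 2}, odd sums rounded down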

    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }
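
    // Worked example of the emulation path (hypothetical values): a = 0x00050003, b = 0x00030002
    //   s = a ^ b = 0x00060001,  r = a | b = 0x00070003
    //   s = (s & 0xfffefffe) >> 1 = 0x00030000
    //   r - s = 0x00040003  ->  per-halfword averages {4, 3}, odd sums rounded up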
191 
192  static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
193  {
194  unsigned int r = 0;
195 
196  #if __CUDA_ARCH__ >= 300
197  asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
198  #else
199  // inspired by Alan Mycroft's null-byte detection algorithm:
200  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
201  unsigned int c;
202  r = a ^ b; // 0x0000 if a == b
203  c = r | 0x80008000; // set msbs, to catch carry out
204  r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
205  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
206  c = r & ~c; // msb = 1, if r was 0x0000
207  r = c >> 15; // convert to bool
208  #endif
209 
210  return r;
211  }
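
    // Worked example of the emulation path (hypothetical values):
    //   a = 0x1234ABCD, b = 0x1234ABCE          high halfwords equal, low halfwords differ
    //   r = a ^ b           = 0x00000003
    //   c = r | 0x80008000  = 0x80008003
    //   r = r ^ c           = 0x80008000        msbs extracted
    //   c = c - 0x00010001  = 0x7FFF8002
    //   c = r & ~c          = 0x80000000        msb survives only where r was 0x0000
    //   r = c >> 15         = 0x00010000        bool 1 in the high halfword, 0 in the low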
212 
213  static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
214  {
215  unsigned int r, c;
216 
217  #if __CUDA_ARCH__ >= 300
218  r = vseteq2(a, b);
219  c = r << 16; // convert bool
220  r = c - r; // into mask
221  #else
222  // inspired by Alan Mycroft's null-byte detection algorithm:
223  // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
224  r = a ^ b; // 0x0000 if a == b
225  c = r | 0x80008000; // set msbs, to catch carry out
226  r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
227  c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
228  c = r & ~c; // msb = 1, if r was 0x0000
229  r = c >> 15; // convert
230  r = c - r; // msbs to
231  r = c | r; // mask
232  #endif
233 
234  return r;
235  }
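
    // The bool-to-mask step works per halfword because (r << 16) - r borrows back
    // through the halfword that produced the 1: e.g. r = 0x00000001 gives
    // 0x00010000 - 0x00000001 = 0x0000FFFF, and r = 0x00010001 gives 0xFFFFFFFF.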
236 
237  static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
238  {
239  unsigned int r = 0;
240 
241  #if __CUDA_ARCH__ >= 300
242  asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
243  #else
244  unsigned int c;
245  asm("not.b32 %0, %0;" : "+r"(b));
246  c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
247  c = c & 0x80008000; // msb = carry-outs
248  r = c >> 15; // convert to bool
249  #endif
250 
251  return r;
252  }
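
    // Why the emulation works: a >= b exactly when a - b needs no borrow, i.e. when
    // a + ~b + 1 carries out of its halfword. vavrg2 computes (a + ~b + 1) / 2, which
    // keeps that carry-out visible in the msb of each halfword, where it is isolated.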

    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }

    // 4 (operations on four 8-bit bytes packed in a 32-bit word)

    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
    #endif /* __CUDA_ARCH__ >= 300 */

        return r;
    }
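
    // Worked example of the emulation path (hypothetical values):
    //   a = 0x010200FF, b = 0x01020001          bytes {01,02,00,FF} plus {01,02,00,01}
    //   s = (a ^ b) & 0x80808080 = 0x00000080   msb sum bits
    //   r = (a & 0x7f7f7f7f) + (b & 0x7f7f7f7f) = 0x02040080
    //   r = r ^ s                = 0x02040000   bytes {02,04,00,00}: independent wrapping adds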

    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ ~b;         // inverted sum bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb sum bits
        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
        r = r ^ s;          // combine inverted msb sum bits and borrows
    #endif

        return r;
    }
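
    // Worked example of the emulation path (hypothetical values):
    //   a = 0x03020100, b = 0x01010101          bytes {03,02,01,00} minus {01,01,01,01}
    //   s = (a ^ ~b) & 0x80808080 = 0x80808080
    //   r = (a | 0x80808080) - (b & 0x7f7f7f7f) = 0x8281807F
    //   r = r ^ s                 = 0x020100FF  bytes {02,01,00,FF}: independent wrapping subtracts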

    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int c;
        c = a ^ b;
        r = a | b;
        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        c = c >> 1;
        r = r - c;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r, t;

    #if __CUDA_ARCH__ >= 300
        r = vseteq4(a, b);
        t = r << 8;         // convert bool
        r = t - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        t = a ^ b;          // 0x00 if a == b
        r = t | 0x80808080; // set msbs, to catch carry out
        t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
        r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
        r = t & ~r;         // msb = 1, if t was 0x00
        t = r >> 7;         // build mask
        t = r - t;          // from
        r = t | r;          // msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

    #if __CUDA_ARCH__ >= 300
        r = vsetge4(a, b);
        s = r << 8;         // convert bool
        r = s - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        r = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        r = r & 0x80808080; // msb = carry-outs
        s = r >> 7;         // build mask
        s = r - s;          // from
        r = s | r;          // msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a ^ b;
        s = (r & s) ^ b;    // select a when a >= b, else select b => max(a, b)
        r = s ^ r;          // select a when b >= a, else select b => min(a, b)
        r = s - r;          // |a - b| = max(a, b) - min(a, b)
    #endif

        return r;
    }
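
    // The selection identity used above: for a full-byte mask s, (s & (a ^ b)) ^ b
    // yields a where s = 0xff and b where s = 0x00, so max(a,b) and min(a,b) are
    // formed byte-wise without branches, and |a - b| = max - min cannot underflow.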
821 
822  static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
823  {
824  unsigned int r = 0;
825 
826  #if __CUDA_ARCH__ >= 300
827  asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
828  #elif __CUDA_ARCH__ >= 200
829  asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
830  asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
831  asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
832  asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
833  #else
834  unsigned int s;
835  s = vcmpge4(a, b); // mask = 0xff if a >= b
836  r = a & s; // select a when b >= a
837  s = b & ~s; // select b when b < a
838  r = r | s; // combine byte selections
839  #endif
840 
841  return r; // byte-wise unsigned maximum
842  }
843 
844  static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
845  {
846  unsigned int r = 0;
847 
848  #if __CUDA_ARCH__ >= 300
849  asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
850  #elif __CUDA_ARCH__ >= 200
851  asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
852  asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
853  asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
854  asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
855  #else
856  unsigned int s;
857  s = vcmpge4(b, a); // mask = 0xff if a >= b
858  r = a & s; // select a when b >= a
859  s = b & ~s; // select b when b < a
860  r = r | s; // combine byte selections
861  #endif
862 
863  return r;
864  }
865 }}}

#endif // OPENCV_CUDA_SIMD_FUNCTIONS_HPP
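
A minimal usage sketch (an assumption, not part of the header): each 32-bit word is
treated as four packed 8-bit lanes, so a kernel can add two byte buffers four lanes at
a time with vadd4. The kernel name, buffer layout, and include path are illustrative.

#include "simd_functions.hpp"

// Hypothetical kernel: lane-wise add of two packed-byte buffers, n 32-bit words long.
__global__ void addPackedBytes(const unsigned int* a, const unsigned int* b,
                               unsigned int* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = cv::cuda::device::vadd4(a[i], b[i]); // four independent byte sums
}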