EstervQrCode 2.0.0
Library for QR code manipulation
vsx_utils.hpp
1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html
4
5#ifndef OPENCV_HAL_VSX_UTILS_HPP
6#define OPENCV_HAL_VSX_UTILS_HPP
7
8#include "opencv2/core/cvdef.h"
9
10#ifndef SKIP_INCLUDES
11# include <assert.h>
12#endif
13
16#if CV_VSX
17
18#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
19#define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v}
20#define __VSX_S4__(c, v) (c){v, v, v, v}
21#define __VSX_S2__(c, v) (c){v, v}
22
23typedef __vector unsigned char vec_uchar16;
24#define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
25#define vec_uchar16_sp(c) (__VSX_S16__(vec_uchar16, (unsigned char)c))
26#define vec_uchar16_c(v) ((vec_uchar16)(v))
27#define vec_uchar16_z vec_uchar16_sp(0)
28
29typedef __vector signed char vec_char16;
30#define vec_char16_set(...) (vec_char16){__VA_ARGS__}
31#define vec_char16_sp(c) (__VSX_S16__(vec_char16, (signed char)c))
32#define vec_char16_c(v) ((vec_char16)(v))
33#define vec_char16_z vec_char16_sp(0)
34
35typedef __vector unsigned short vec_ushort8;
36#define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
37#define vec_ushort8_sp(c) (__VSX_S8__(vec_ushort8, (unsigned short)c))
38#define vec_ushort8_c(v) ((vec_ushort8)(v))
39#define vec_ushort8_z vec_ushort8_sp(0)
40
41typedef __vector signed short vec_short8;
42#define vec_short8_set(...) (vec_short8){__VA_ARGS__}
43#define vec_short8_sp(c) (__VSX_S8__(vec_short8, (signed short)c))
44#define vec_short8_c(v) ((vec_short8)(v))
45#define vec_short8_z vec_short8_sp(0)
46
47typedef __vector unsigned int vec_uint4;
48#define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
49#define vec_uint4_sp(c) (__VSX_S4__(vec_uint4, (unsigned int)c))
50#define vec_uint4_c(v) ((vec_uint4)(v))
51#define vec_uint4_z vec_uint4_sp(0)
52
53typedef __vector signed int vec_int4;
54#define vec_int4_set(...) (vec_int4){__VA_ARGS__}
55#define vec_int4_sp(c) (__VSX_S4__(vec_int4, (signed int)c))
56#define vec_int4_c(v) ((vec_int4)(v))
57#define vec_int4_z vec_int4_sp(0)
58
59typedef __vector float vec_float4;
60#define vec_float4_set(...) (vec_float4){__VA_ARGS__}
61#define vec_float4_sp(c) (__VSX_S4__(vec_float4, c))
62#define vec_float4_c(v) ((vec_float4)(v))
63#define vec_float4_z vec_float4_sp(0)
64
65typedef __vector unsigned long long vec_udword2;
66#define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
67#define vec_udword2_sp(c) (__VSX_S2__(vec_udword2, (unsigned long long)c))
68#define vec_udword2_c(v) ((vec_udword2)(v))
69#define vec_udword2_z vec_udword2_sp(0)
70
71typedef __vector signed long long vec_dword2;
72#define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
73#define vec_dword2_sp(c) (__VSX_S2__(vec_dword2, (signed long long)c))
74#define vec_dword2_c(v) ((vec_dword2)(v))
75#define vec_dword2_z vec_dword2_sp(0)
76
77typedef __vector double vec_double2;
78#define vec_double2_set(...) (vec_double2){__VA_ARGS__}
79#define vec_double2_c(v) ((vec_double2)(v))
80#define vec_double2_sp(c) (__VSX_S2__(vec_double2, c))
81#define vec_double2_z vec_double2_sp(0)
82
83#define vec_bchar16 __vector __bool char
84#define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
85#define vec_bchar16_c(v) ((vec_bchar16)(v))
86
87#define vec_bshort8 __vector __bool short
88#define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
89#define vec_bshort8_c(v) ((vec_bshort8)(v))
90
91#define vec_bint4 __vector __bool int
92#define vec_bint4_set(...) (vec_bint4){__VA_ARGS__}
93#define vec_bint4_c(v) ((vec_bint4)(v))
94
95#define vec_bdword2 __vector __bool long long
96#define vec_bdword2_set(...) (vec_bdword2){__VA_ARGS__}
97#define vec_bdword2_c(v) ((vec_bdword2)(v))
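
// Illustrative usage sketch, not part of the original header: how the helper
// macros above are typically used to build constants and reinterpret lanes.
// Assumes a VSX-enabled POWER8+ target with the AltiVec intrinsics available,
// as the rest of this header does; the function name is hypothetical.
static inline vec_int4 vsx_example_make_constants()
{
    vec_int4  ramp = vec_int4_set(0, 1, 2, 3); // per-lane initializer
    vec_int4  four = vec_int4_sp(4);           // splat the same value into every lane
    vec_uint4 bits = vec_uint4_c(ramp);        // reinterpret the lanes, no value conversion
    (void)bits;
    return vec_add(ramp, four);                // {4, 5, 6, 7}
}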
98
99#define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))
100
101#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
102VSX_FINLINE(rt) fnm(const rg& a) { return fn2(a); }
103
104#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
105VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
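
// Illustrative sketch, not part of the original header: each redirect macro just
// emits a thin always-inline forwarding wrapper. The alias below is hypothetical
// and forwards to the standard vec_add intrinsic for signed words.
VSX_REDIRECT_2RG(vec_int4, vec_int4, vsx_example_add_alias, vec_add)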
106
107/*
108 * GCC VSX compatibility
109**/
110#if defined(__GNUG__) && !defined(__clang__)
111
112// inline asm helper
113#define VSX_IMPL_1RG(rt, rg, opc, fnm) \
114VSX_FINLINE(rt) fnm(const rg& a) \
115{ rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "=wa" (rs) : "wa" (a)); return rs; }
116
117#define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
118VSX_FINLINE(rt) fnm(const rg& a) \
119{ rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }
120
121#define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm) \
122VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
123{ rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }
124
125#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
126
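// Illustrative sketch, not part of the original header: a single instantiation of
// the helpers above wraps one instruction in an always-inline function. The alias
// below is hypothetical and wraps the ISA 2.07 byte population count, the same
// instruction used further down for vec_popcntu.
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vsx_example_popcntb)
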
127#if __GNUG__ < 8
128
129 // Support for int4 -> dword2 expanding multiply was added in GCC 8.
130 #ifdef vec_mule
131 #undef vec_mule
132 #endif
133 #ifdef vec_mulo
134 #undef vec_mulo
135 #endif
136
137 VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mule, __builtin_vec_mule)
138 VSX_REDIRECT_2RG(vec_short8, vec_char16, vec_mule, __builtin_vec_mule)
139 VSX_REDIRECT_2RG(vec_int4, vec_short8, vec_mule, __builtin_vec_mule)
140 VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mule, __builtin_vec_mule)
141 VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mulo, __builtin_vec_mulo)
142 VSX_REDIRECT_2RG(vec_short8, vec_char16, vec_mulo, __builtin_vec_mulo)
143 VSX_REDIRECT_2RG(vec_int4, vec_short8, vec_mulo, __builtin_vec_mulo)
144 VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mulo, __builtin_vec_mulo)
145
146 // dword2 support arrived in ISA 2.07 and GCC 8+
147 VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulosw, vec_mule)
148 VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
149 VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulesw, vec_mulo)
150 VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)
151
152#endif
153
154#if __GNUG__ < 7
155// up to GCC 6, vec_mul only supports single/double precision and long long
156# ifdef vec_mul
157# undef vec_mul
158# endif
159/*
160 * there is no single instruction for 8-bit and 16-bit multiplication in ISA 2.07,
161 * so XLC implements it using "multiply even", "multiply odd" and "permute"
162**/
163# define VSX_IMPL_MULH(Tvec, cperm) \
164 VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \
165 { \
166 static const vec_uchar16 ev_od = {cperm}; \
167 return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \
168 }
169 #define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
170 VSX_IMPL_MULH(vec_char16, VSX_IMPL_MULH_P16)
171 VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
172 #define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
173 VSX_IMPL_MULH(vec_short8, VSX_IMPL_MULH_P8)
174 VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
175 // vmuluwm can be used for both unsigned and signed integers (per the ISA documentation)
176 VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
177 VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
178 // redirect to the GCC builtin vec_mul, since it already supports single/double precision and long long
179 VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
180 VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
181 VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul)
182 VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
183#endif // __GNUG__ < 7
184
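// Illustrative sketch, not part of the original header: the even/odd multiply trick
// used by VSX_IMPL_MULH above. vec_mule/vec_mulo produce 16-bit products of the
// even- and odd-indexed byte lanes, and the permute pattern gathers the truncated
// 8-bit products back into their original lane order. The function name is hypothetical.
static inline vec_uchar16 vsx_example_mul_u8(const vec_uchar16& a, const vec_uchar16& b)
{
    static const vec_uchar16 ev_od = {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30};
    return vec_perm((vec_uchar16)vec_mule(a, b), (vec_uchar16)vec_mulo(a, b), ev_od);
}
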
185#if __GNUG__ < 6
186/*
187 * The "compare greater than or equal" instructions in ISA 2.07 only support single
188 * and double precision.
189 * XLC and newer versions of GCC implement the integer variants using "compare greater than" and NOR.
190**/
191# ifdef vec_cmpge
192# undef vec_cmpge
193# endif
194# ifdef vec_cmple
195# undef vec_cmple
196# endif
197# define vec_cmple(a, b) vec_cmpge(b, a)
198# define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
199 VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
200
201 VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge)
202 VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
203 VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge)
204 VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
205 VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge)
206 VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge)
207 VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge)
208 VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
209
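// Illustrative sketch, not part of the original header: the asm above is the
// instruction-level form of the identity cmpge(a, b) == not(cmpgt(b, a)), which
// could also be written with portable intrinsics as below (hypothetical name).
static inline vec_bint4 vsx_example_cmpge_s32(const vec_int4& a, const vec_int4& b)
{
    vec_int4 lt = vec_int4_c(vec_cmpgt(b, a)); // lanes where a < b
    return vec_bint4_c(vec_nor(lt, lt));       // invert: a >= b
}
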
210// redirect to the GCC builtin cmpge, since it already supports single and double precision
211 VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
212 VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
213
214// up to GCC 5, vec_nor doesn't support bool long long
215# undef vec_nor
216 template<typename T>
217 VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
218
219 VSX_FINLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
220 { return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
221
222// vec_packs doesn't support doublewords in GCC 4 and early versions of GCC 5
223# undef vec_packs
224 VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs)
225 VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
226 VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs)
227 VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs)
228
229 VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs)
230 VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
231#endif // __GNUG__ < 6
232
233#if __GNUG__ < 5
234// vec_xxpermdi in GCC 4 is missing little-endian support, just like Clang 4
235# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
236// same situation as vec_xxpermdi
237# undef vec_vbpermq
238 VSX_IMPL_2VRG(vec_udword2, vec_uchar16, vbpermq, vec_vbpermq)
239 VSX_IMPL_2VRG(vec_dword2, vec_char16, vbpermq, vec_vbpermq)
240#else
241# define vec_permi vec_xxpermdi
242#endif // __GNUG__ < 5
243
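// Illustrative sketch, not part of the original header: vec_permi(a, b, c) picks one
// doubleword from each source, i.e. result = { a[(c >> 1) & 1], b[c & 1] }.
// The function name is hypothetical.
static inline vec_double2 vsx_example_permi(const vec_double2& a, const vec_double2& b)
{ return vec_permi(a, b, 1); } // { a[0], b[1] }
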
244// shift left double by word immediate
245#ifndef vec_sldw
246# define vec_sldw __builtin_vsx_xxsldwi
247#endif
248
249// vector population count
250VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
251VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcntu)
252VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
253VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcntu)
254VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcntu)
255VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcntu)
256VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
257VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcntu)
258
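// Illustrative sketch, not part of the original header: vec_popcntu returns an
// unsigned per-lane population count regardless of the input's signedness.
// The function name is hypothetical.
static inline vec_uint4 vsx_example_popcount(const vec_int4& a)
{ return vec_popcntu(a); } // e.g. {-1, 0, 3, 16} -> {32, 0, 2, 1}
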
259// converts between single and double-precision
260VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
261VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)
262
263// converts word and doubleword to double-precision
264#undef vec_ctd
265VSX_IMPL_1RG(vec_double2, vec_int4, xvcvsxwdp, vec_ctdo)
266VSX_IMPL_1RG(vec_double2, vec_uint4, xvcvuxwdp, vec_ctdo)
267VSX_IMPL_1RG(vec_double2, vec_dword2, xvcvsxddp, vec_ctd)
268VSX_IMPL_1RG(vec_double2, vec_udword2, xvcvuxddp, vec_ctd)
269
270// converts word and doubleword to single-precision
271#undef vec_ctf
272VSX_IMPL_1RG(vec_float4, vec_int4, xvcvsxwsp, vec_ctf)
273VSX_IMPL_1RG(vec_float4, vec_uint4, xvcvuxwsp, vec_ctf)
274VSX_IMPL_1RG(vec_float4, vec_dword2, xvcvsxdsp, vec_ctfo)
275VSX_IMPL_1RG(vec_float4, vec_udword2, xvcvuxdsp, vec_ctfo)
276
277// converts single and double precision to signed word
278#undef vec_cts
279VSX_IMPL_1RG(vec_int4, vec_double2, xvcvdpsxws, vec_ctso)
280VSX_IMPL_1RG(vec_int4, vec_float4, xvcvspsxws, vec_cts)
281
282// converts single and double precision to unsigned word
283#undef vec_ctu
284VSX_IMPL_1RG(vec_uint4, vec_double2, xvcvdpuxws, vec_ctuo)
285VSX_IMPL_1RG(vec_uint4, vec_float4, xvcvspuxws, vec_ctu)
286
287// converts single and double precision to signed doubleword
288#undef vec_ctsl
289VSX_IMPL_1RG(vec_dword2, vec_double2, xvcvdpsxds, vec_ctsl)
290VSX_IMPL_1RG(vec_dword2, vec_float4, xvcvspsxds, vec_ctslo)
291
292// converts single and double precision to unsigned doubleword
293#undef vec_ctul
294VSX_IMPL_1RG(vec_udword2, vec_double2, xvcvdpuxds, vec_ctul)
295VSX_IMPL_1RG(vec_udword2, vec_float4, xvcvspuxds, vec_ctulo)
296
297// just in case GCC doesn't define them
298#ifndef vec_xl
299# define vec_xl vec_vsx_ld
300# define vec_xst vec_vsx_st
301#endif
302
303#endif // GCC VSX compatibility
304
305/*
306 * CLANG VSX compatibility
307**/
308#if defined(__clang__) && !defined(__IBMCPP__)
309
310/*
311 * Clang doesn't support the %x<n> operand modifier in inline asm templates, which is needed
312 * to fix the register number when using any of the register constraints wa, wd, wf.
313 *
314 * For more explanation, check out PowerPC and IBM RS6000 in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
315 * There is also an open bug: https://bugs.llvm.org/show_bug.cgi?id=31837
316 *
317 * So we cannot use inline asm here; we only use the built-in functions that Clang supports,
318 * and fall back to __builtin_convertvector where Clang is missing a vector conversion built-in.
319 *
320 * todo: the Clang asm template bug has been fixed, so the current workarounds should be reconsidered.
321*/
322
323// convert vector helper
324#define VSX_IMPL_CONVERT(rt, rg, fnm) \
325VSX_FINLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
326
327#ifndef vec_permi
328#if __clang_major__ < 5
329// implement vec_permi in a dirty way
330# define VSX_IMPL_CLANG_4_PERMI(Tvec) \
331 VSX_FINLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c) \
332 { \
333 switch (c) \
334 { \
335 case 0: \
336 return vec_mergeh(a, b); \
337 case 1: \
338 return vec_mergel(vec_mergeh(a, a), b); \
339 case 2: \
340 return vec_mergeh(vec_mergel(a, a), b); \
341 default: \
342 return vec_mergel(a, b); \
343 } \
344 }
345 VSX_IMPL_CLANG_4_PERMI(vec_udword2)
346 VSX_IMPL_CLANG_4_PERMI(vec_dword2)
347 VSX_IMPL_CLANG_4_PERMI(vec_double2)
348
349// vec_xxsldwi is missing in clang 4
350# define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
351#else
352// vec_xxpermdi is missing little-endian support in Clang 4, just like GCC 4
353# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
354#endif // __clang_major__ < 5
355#endif
356
357// shift left double by word immediate
358#ifndef vec_sldw
359# define vec_sldw vec_xxsldwi
360#endif
361
362#if __clang_major__ < 13
363// Implement vec_rsqrt since clang only supports vec_rsqrte
364#ifndef vec_rsqrt
365 VSX_FINLINE(vec_float4) vec_rsqrt(const vec_float4& a)
366 { return vec_div(vec_float4_sp(1), vec_sqrt(a)); }
367
368 VSX_FINLINE(vec_double2) vec_rsqrt(const vec_double2& a)
369 { return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
370#endif
371
372// vec_promote is missing support for doublewords
373VSX_FINLINE(vec_dword2) vec_promote(long long a, int b)
374{
375 vec_dword2 ret = vec_dword2_z;
376 ret[b & 1] = a;
377 return ret;
378}
379
380VSX_FINLINE(vec_udword2) vec_promote(unsigned long long a, int b)
381{
382 vec_udword2 ret = vec_udword2_z;
383 ret[b & 1] = a;
384 return ret;
385}
386#endif
387
388// vec_popcnt should return an unsigned type, but Clang disagrees, just as GCC does with vec_vpopcnt
389#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast) \
390VSX_FINLINE(Tvec) vec_popcntu(const Tvec2& a) \
391{ return ucast(vec_popcnt(a)); }
392VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
393VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
394VSX_IMPL_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
395VSX_IMPL_POPCNTU(vec_udword2, vec_dword2, vec_udword2_c);
396// redirect unsigned types
397VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
398VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
399VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)
400VSX_REDIRECT_1RG(vec_udword2, vec_udword2, vec_popcntu, vec_popcnt)
401
402// converts between single and double precision
403VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
404VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)
405
406// converts word and doubleword to double-precision
407#ifdef vec_ctd
408# undef vec_ctd
409#endif
410VSX_REDIRECT_1RG(vec_double2, vec_int4, vec_ctdo, __builtin_vsx_xvcvsxwdp)
411VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
412
413VSX_IMPL_CONVERT(vec_double2, vec_dword2, vec_ctd)
414VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)
415
416// converts word and doubleword to single-precision
417#if __clang_major__ > 4
418# undef vec_ctf
419#endif
420VSX_IMPL_CONVERT(vec_float4, vec_int4, vec_ctf)
421VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
422VSX_REDIRECT_1RG(vec_float4, vec_dword2, vec_ctfo, __builtin_vsx_xvcvsxdsp)
423VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctfo, __builtin_vsx_xvcvuxdsp)
424
425// converts single and double precision to signed word
426#if __clang_major__ > 4
427# undef vec_cts
428#endif
429VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_ctso, __builtin_vsx_xvcvdpsxws)
430VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts)
431
432// converts single and double precision to unsigned word
433#if __clang_major__ > 4
434# undef vec_ctu
435#endif
436VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctuo, __builtin_vsx_xvcvdpuxws)
437VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu)
438
439// converts single and double precision to signed doubleword
440#ifdef vec_ctsl
441# undef vec_ctsl
442#endif
443VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
444// __builtin_convertvector can't do this conversion; it has no mapping for xvcvspsxds
445VSX_FINLINE(vec_dword2) vec_ctslo(const vec_float4& a)
446{ return vec_ctsl(vec_cvfo(a)); }
447
448// converts single and double precision to unsigned doubleword
449#ifdef vec_ctul
450# undef vec_ctul
451#endif
452VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
453// __builtin_convertvector can't do this conversion; it has no mapping for xvcvspuxds
454VSX_FINLINE(vec_udword2) vec_ctulo(const vec_float4& a)
455{ return vec_ctul(vec_cvfo(a)); }
456
457#endif // CLANG VSX compatibility
458
459/*
460 * Common GCC, CLANG compatibility
461**/
462#if defined(__GNUG__) && !defined(__IBMCPP__)
463
464#ifdef vec_cvf
465# undef vec_cvf
466#endif
467
468#define VSX_IMPL_CONV_EVEN_4_2(rt, rg, fnm, fn2) \
469VSX_FINLINE(rt) fnm(const rg& a) \
470{ return fn2(vec_sldw(a, a, 1)); }
471
472VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_float4, vec_cvf, vec_cvfo)
473VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_int4, vec_ctd, vec_ctdo)
474VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_uint4, vec_ctd, vec_ctdo)
475
476VSX_IMPL_CONV_EVEN_4_2(vec_dword2, vec_float4, vec_ctsl, vec_ctslo)
477VSX_IMPL_CONV_EVEN_4_2(vec_udword2, vec_float4, vec_ctul, vec_ctulo)
478
479#define VSX_IMPL_CONV_EVEN_2_4(rt, rg, fnm, fn2) \
480VSX_FINLINE(rt) fnm(const rg& a) \
481{ \
482 rt v4 = fn2(a); \
483 return vec_sldw(v4, v4, 3); \
484}
485
486VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_double2, vec_cvf, vec_cvfo)
487VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_dword2, vec_ctf, vec_ctfo)
488VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_udword2, vec_ctf, vec_ctfo)
489
490VSX_IMPL_CONV_EVEN_2_4(vec_int4, vec_double2, vec_cts, vec_ctso)
491VSX_IMPL_CONV_EVEN_2_4(vec_uint4, vec_double2, vec_ctu, vec_ctuo)
492
493// Only for Eigen!
494/*
495 * changing the behavior of the conversion intrinsics for GCC affects Eigen,
496 * so we redefine the old behavior again, only for GCC and Clang
497*/
498#if !defined(__clang__) || __clang_major__ > 4
499 // ignoring second arg since Eigen only truncates toward zero
500# define VSX_IMPL_CONV_2VARIANT(rt, rg, fnm, fn2) \
501 VSX_FINLINE(rt) fnm(const rg& a, int only_truncate) \
502 { \
503 assert(only_truncate == 0); \
504 CV_UNUSED(only_truncate); \
505 return fn2(a); \
506 }
507 VSX_IMPL_CONV_2VARIANT(vec_int4, vec_float4, vec_cts, vec_cts)
508 VSX_IMPL_CONV_2VARIANT(vec_uint4, vec_float4, vec_ctu, vec_ctu)
509 VSX_IMPL_CONV_2VARIANT(vec_float4, vec_int4, vec_ctf, vec_ctf)
510 VSX_IMPL_CONV_2VARIANT(vec_float4, vec_uint4, vec_ctf, vec_ctf)
511 // define vec_cts for converting double precision to signed doubleword,
512 // which isn't compatible with XLC, but that's okay since Eigen only uses it with GCC
513 VSX_IMPL_CONV_2VARIANT(vec_dword2, vec_double2, vec_cts, vec_ctsl)
514#endif // Eigen
515
516#endif // Common GCC, CLANG compatibility
517
518/*
519 * XLC VSX compatibility
520**/
521#if defined(__IBMCPP__)
522
523// vector population count
524#define vec_popcntu vec_popcnt
525
526// overload and redirect, setting the second arg to zero,
527// since we only support conversions without the second arg
528#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
529VSX_FINLINE(rt) fnm(const rg& a) { return fnm(a, 0); }
530
531VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4, vec_ctd)
532VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4, vec_ctd)
533VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2, vec_ctd)
534VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
535
536VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4, vec_ctf)
537VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4, vec_ctf)
538VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2, vec_ctf)
539VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf)
540
541VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts)
542VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4, vec_cts)
543
544VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu)
545VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4, vec_ctu)
546
547VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl)
548VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4, vec_ctsl)
549
550VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
551VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4, vec_ctul)
552
553// fixme: implement conversions of odd-numbered elements in a roundabout way,
554// since XLC doesn't support VSX register operands in inline asm.
555#define VSX_IMPL_CONV_ODD_4_2(rt, rg, fnm, fn2) \
556VSX_FINLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }
557
558VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_float4, vec_cvfo, vec_cvf)
559VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_int4, vec_ctdo, vec_ctd)
560VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_uint4, vec_ctdo, vec_ctd)
561
562VSX_IMPL_CONV_ODD_4_2(vec_dword2, vec_float4, vec_ctslo, vec_ctsl)
563VSX_IMPL_CONV_ODD_4_2(vec_udword2, vec_float4, vec_ctulo, vec_ctul)
564
565#define VSX_IMPL_CONV_ODD_2_4(rt, rg, fnm, fn2) \
566VSX_FINLINE(rt) fnm(const rg& a) \
567{ \
568 rt v4 = fn2(a); \
569 return vec_sldw(v4, v4, 1); \
570}
571
572VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_double2, vec_cvfo, vec_cvf)
573VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_dword2, vec_ctfo, vec_ctf)
574VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_udword2, vec_ctfo, vec_ctf)
575
576VSX_IMPL_CONV_ODD_2_4(vec_int4, vec_double2, vec_ctso, vec_cts)
577VSX_IMPL_CONV_ODD_2_4(vec_uint4, vec_double2, vec_ctuo, vec_ctu)
578
579#endif // XLC VSX compatibility
580
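// Illustrative sketch, not part of the original header: after the per-compiler
// sections above, the single-argument conversion helpers behave uniformly across
// GCC, Clang and XLC, e.g. integer -> float followed by a truncating float -> integer
// round trip. The function name is hypothetical.
static inline vec_int4 vsx_example_truncate_sum(const vec_float4& a)
{
    vec_float4 sum = vec_add(a, vec_ctf(vec_int4_sp(1))); // a + 1.0f
    return vec_cts(sum);                                  // truncate toward zero
}
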
581// ignore the GCC warning caused by -Wunused-but-set-variable in rare cases
582#if defined(__GNUG__) && !defined(__clang__)
583# define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
584#else // CLANG, XLC
585# define VSX_UNUSED(Tvec) Tvec
586#endif
587
588// GCC can figure out the long int casts on its own, while XLC and Clang find them ambiguous
589#if defined(__clang__) || defined(__IBMCPP__)
590 VSX_FINLINE(vec_udword2) vec_splats(uint64 v)
591 { return vec_splats((unsigned long long) v); }
592
593 VSX_FINLINE(vec_dword2) vec_splats(int64 v)
594 { return vec_splats((long long) v); }
595
596 VSX_FINLINE(vec_udword2) vec_promote(uint64 a, int b)
597 { return vec_promote((unsigned long long) a, b); }
598
599 VSX_FINLINE(vec_dword2) vec_promote(int64 a, int b)
600 { return vec_promote((long long) a, b); }
601#endif
602
603/*
604 * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
605 * load and store using an offset scaled by the pointer's element type
606 *
607 * implement vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer)
608 * load and store using a fixed byte offset
609 *
610 * Note: in Clang, vec_xl and vec_xst fail to load from unaligned addresses,
611 * so we use vec_vsx_ld and vec_vsx_st instead
612*/
613
614#if defined(__clang__) && !defined(__IBMCPP__)
615# define vsx_ldf vec_vsx_ld
616# define vsx_stf vec_vsx_st
617#else // GCC , XLC
618# define vsx_ldf vec_xl
619# define vsx_stf vec_xst
620#endif
621
622#define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
623#define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
624#define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
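
// Illustrative sketch, not part of the original header: vsx_ld/vsx_st take an element
// offset (scaled by sizeof(*ptr)), while vsx_ldf/vsx_stf take a raw byte offset.
// The function name is hypothetical.
static inline void vsx_example_copy8f(const float* src, float* dst)
{
    vec_float4 lo = vsx_ld(0, src); // elements 0..3 (byte offset 0)
    vec_float4 hi = vsx_ld(4, src); // elements 4..7 (byte offset 16)
    vsx_st(lo, 0, dst);
    vsx_st(hi, 4, dst);
}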
625
626/*
627 * implement vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer) to load and store doublewords
628 * In GCC, vec_xl and vec_xst map to vec_vsx_ld and vec_vsx_st, which don't support long long,
629 * and in Clang we use vec_vsx_ld and vec_vsx_st because vec_xl and vec_xst fail to load from unaligned addresses
630 *
631 * In XLC, vec_xl and vec_xst fail to cast int64 (long int) to long long
632*/
633#if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
634 VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
635 { return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }
636
637 VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
638 { return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }
639
640 VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
641 { vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }
642
643 VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
644 { vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
645#else // XLC
646 VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
647 { return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }
648
649 VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
650 { return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }
651
652 VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
653 { vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }
654
655 VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
656 { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
657#endif
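
// Illustrative sketch, not part of the original header: vsx_ld2/vsx_st2 follow the
// same element-offset convention as vsx_ld/vsx_st, but for 64-bit integer elements.
// The function name is hypothetical.
static inline vec_udword2 vsx_example_load_u64(const uint64* p)
{ return vsx_ld2(0, p); } // elements 0..1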
658
659// Store the lower 8 bytes
660#define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)
661
662// Store the upper 8 bytes
663#define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)
664
665// Load 64 bits of integer data into the lower part
666#define VSX_IMPL_LOAD_L8(Tvec, Tp) \
667VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p) \
668{ return ((Tvec)vec_promote(*((uint64*)p), 0)); }
669
670VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
671VSX_IMPL_LOAD_L8(vec_char16, schar)
672VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
673VSX_IMPL_LOAD_L8(vec_short8, short)
674VSX_IMPL_LOAD_L8(vec_uint4, uint)
675VSX_IMPL_LOAD_L8(vec_int4, int)
676VSX_IMPL_LOAD_L8(vec_float4, float)
677VSX_IMPL_LOAD_L8(vec_udword2, uint64)
678VSX_IMPL_LOAD_L8(vec_dword2, int64)
679VSX_IMPL_LOAD_L8(vec_double2, double)
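
// Illustrative sketch, not part of the original header: vec_ld_l8/vec_st_l8 move only
// 64 bits, which is handy for buffer tails, e.g. copying eight uchars through a vector
// register. The function name is hypothetical.
static inline void vsx_example_copy_l8(const uchar* src, uchar* dst)
{
    vec_uchar16 v = vec_ld_l8(src); // the low 8 bytes are loaded, the rest is unspecified
    vec_st_l8(v, dst);              // only the low 8 bytes are stored back
}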
680
681// logical not
682#define vec_not(a) vec_nor(a, a)
683
684// the following are POWER9 intrinsics; provide fallbacks for older ISAs
685// not equal
686#ifndef vec_cmpne
687# define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
688#endif
689
690// absolute difference
691#ifndef _ARCH_PWR9
692# undef vec_absd
693# define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
694#endif
695
696/*
697 * Implement vec_unpacklu and vec_unpackhu
698 * since vec_unpackl, vec_unpackh only support signed integers
699**/
700#define VSX_IMPL_UNPACKU(rt, rg, zero) \
701VSX_FINLINE(rt) vec_unpacklu(const rg& a) \
702{ return (rt)(vec_mergel(a, zero)); } \
703VSX_FINLINE(rt) vec_unpackhu(const rg& a) \
704{ return (rt)(vec_mergeh(a, zero)); }
705
706VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
707VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z)
708VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z)
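
// Illustrative sketch, not part of the original header: zero-extend 16 uchars into
// two vectors of 8 ushorts using the unsigned unpack helpers above.
// The function name is hypothetical.
static inline void vsx_example_expand_u8(const vec_uchar16& a, vec_ushort8& b0, vec_ushort8& b1)
{
    b0 = vec_unpackhu(a); // one half of the lanes, zero-extended to 16 bits
    b1 = vec_unpacklu(a); // the other half, zero-extended to 16 bits
}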
709
710/*
711 * Implement vec_mergesqe and vec_mergesqo
712 * Merge the even-indexed and odd-indexed elements of two vectors, in sequence
713*/
714#define VSX_IMPL_PERM(rt, fnm, ...) \
715VSX_FINLINE(rt) fnm(const rt& a, const rt& b) \
716{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
717
718// 16
719#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
720#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
721VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
722VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
723VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe)
724VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo)
725// 8
726#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
727#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
728VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
729VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
730VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe)
731VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo)
732// 4
733#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
734#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
735VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe)
736VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo)
737VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe)
738VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo)
739VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
740VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
741// 2
742VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
743VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
744VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh)
745VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel)
746VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
747VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
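
// Illustrative sketch, not part of the original header: vec_mergesqe/vec_mergesqo gather
// the even- and odd-indexed lanes of two concatenated vectors, which is the building block
// of the 2-channel deinterleave helpers further down. The function name is hypothetical.
static inline void vsx_example_split_even_odd(const vec_ushort8& v0, const vec_ushort8& v1,
                                              vec_ushort8& even, vec_ushort8& odd)
{
    even = vec_mergesqe(v0, v1); // v0[0], v0[2], v0[4], v0[6], v1[0], v1[2], v1[4], v1[6]
    odd  = vec_mergesqo(v0, v1); // v0[1], v0[3], v0[5], v0[7], v1[1], v1[3], v1[5], v1[7]
}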
748
749/*
750 * Implement vec_mergesqh and vec_mergesql
751 * Merge the most and least significant halves of two vectors, in sequence
752*/
753#define VSX_IMPL_MERGESQHL(Tvec) \
754VSX_FINLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b) \
755{ return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); } \
756VSX_FINLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b) \
757{ return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
758VSX_IMPL_MERGESQHL(vec_uchar16)
759VSX_IMPL_MERGESQHL(vec_char16)
760VSX_IMPL_MERGESQHL(vec_ushort8)
761VSX_IMPL_MERGESQHL(vec_short8)
762VSX_IMPL_MERGESQHL(vec_uint4)
763VSX_IMPL_MERGESQHL(vec_int4)
764VSX_IMPL_MERGESQHL(vec_float4)
765VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
766VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
767VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh)
768VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel)
769VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
770VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
771
772
773// 2- and 4-channel interleave for all types except 2-lane vectors
774#define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec) \
775VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
776{ \
777 vsx_stf(vec_mergeh(a, b), 0, ptr); \
778 vsx_stf(vec_mergel(a, b), 16, ptr); \
779} \
780VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
781 const Tvec& c, const Tvec& d, Tp* ptr) \
782{ \
783 Tvec ac = vec_mergeh(a, c); \
784 Tvec bd = vec_mergeh(b, d); \
785 vsx_stf(vec_mergeh(ac, bd), 0, ptr); \
786 vsx_stf(vec_mergel(ac, bd), 16, ptr); \
787 ac = vec_mergel(a, c); \
788 bd = vec_mergel(b, d); \
789 vsx_stf(vec_mergeh(ac, bd), 32, ptr); \
790 vsx_stf(vec_mergel(ac, bd), 48, ptr); \
791}
792VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16)
793VSX_IMPL_ST_INTERLEAVE(schar, vec_char16)
794VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
795VSX_IMPL_ST_INTERLEAVE(short, vec_short8)
796VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4)
797VSX_IMPL_ST_INTERLEAVE(int, vec_int4)
798VSX_IMPL_ST_INTERLEAVE(float, vec_float4)
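
// Illustrative sketch, not part of the original header: writing two planes as an
// interleaved 2-channel buffer, e.g. pairs of floats. dst receives
// a[0], b[0], a[1], b[1], ..., a[3], b[3]. The function name is hypothetical.
static inline void vsx_example_store_2ch(const vec_float4& a, const vec_float4& b, float* dst)
{ vec_st_interleave(a, b, dst); }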
799
800// 2- and 4-channel deinterleave for 16-lane vectors
801#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec) \
802VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
803{ \
804 Tvec v0 = vsx_ld(0, ptr); \
805 Tvec v1 = vsx_ld(16, ptr); \
806 a = vec_mergesqe(v0, v1); \
807 b = vec_mergesqo(v0, v1); \
808} \
809VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
810 Tvec& c, Tvec& d) \
811{ \
812 Tvec v0 = vsx_ld(0, ptr); \
813 Tvec v1 = vsx_ld(16, ptr); \
814 Tvec v2 = vsx_ld(32, ptr); \
815 Tvec v3 = vsx_ld(48, ptr); \
816 Tvec m0 = vec_mergesqe(v0, v1); \
817 Tvec m1 = vec_mergesqe(v2, v3); \
818 a = vec_mergesqe(m0, m1); \
819 c = vec_mergesqo(m0, m1); \
820 m0 = vec_mergesqo(v0, v1); \
821 m1 = vec_mergesqo(v2, v3); \
822 b = vec_mergesqe(m0, m1); \
823 d = vec_mergesqo(m0, m1); \
824}
825VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
826VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)
827
828// 2- and 4-channel deinterleave for 8-lane vectors
829#define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec) \
830VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
831{ \
832 Tvec v0 = vsx_ld(0, ptr); \
833 Tvec v1 = vsx_ld(8, ptr); \
834 a = vec_mergesqe(v0, v1); \
835 b = vec_mergesqo(v0, v1); \
836} \
837VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
838 Tvec& c, Tvec& d) \
839{ \
840 Tvec v0 = vsx_ld(0, ptr); \
841 Tvec v1 = vsx_ld(8, ptr); \
842 Tvec m0 = vec_mergeh(v0, v1); \
843 Tvec m1 = vec_mergel(v0, v1); \
844 Tvec ab0 = vec_mergeh(m0, m1); \
845 Tvec cd0 = vec_mergel(m0, m1); \
846 v0 = vsx_ld(16, ptr); \
847 v1 = vsx_ld(24, ptr); \
848 m0 = vec_mergeh(v0, v1); \
849 m1 = vec_mergel(v0, v1); \
850 Tvec ab1 = vec_mergeh(m0, m1); \
851 Tvec cd1 = vec_mergel(m0, m1); \
852 a = vec_mergesqh(ab0, ab1); \
853 b = vec_mergesql(ab0, ab1); \
854 c = vec_mergesqh(cd0, cd1); \
855 d = vec_mergesql(cd0, cd1); \
856}
857VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
858VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8)
859
860// 2- and 4-channel deinterleave for 4-lane vectors
861#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec) \
862VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
863{ \
864 a = vsx_ld(0, ptr); \
865 b = vsx_ld(4, ptr); \
866 Tvec m0 = vec_mergeh(a, b); \
867 Tvec m1 = vec_mergel(a, b); \
868 a = vec_mergeh(m0, m1); \
869 b = vec_mergel(m0, m1); \
870} \
871VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
872 Tvec& c, Tvec& d) \
873{ \
874 Tvec v0 = vsx_ld(0, ptr); \
875 Tvec v1 = vsx_ld(4, ptr); \
876 Tvec v2 = vsx_ld(8, ptr); \
877 Tvec v3 = vsx_ld(12, ptr); \
878 Tvec m0 = vec_mergeh(v0, v2); \
879 Tvec m1 = vec_mergeh(v1, v3); \
880 a = vec_mergeh(m0, m1); \
881 b = vec_mergel(m0, m1); \
882 m0 = vec_mergel(v0, v2); \
883 m1 = vec_mergel(v1, v3); \
884 c = vec_mergeh(m0, m1); \
885 d = vec_mergel(m0, m1); \
886}
887VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4)
888VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4)
889VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
890
891// 2- and 4-channel interleave and deinterleave for 2-lane vectors
892#define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func) \
893VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
894{ \
895 st_func(vec_mergeh(a, b), 0, ptr); \
896 st_func(vec_mergel(a, b), 2, ptr); \
897} \
898VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
899 const Tvec& c, const Tvec& d, Tp* ptr) \
900{ \
901 st_func(vec_mergeh(a, b), 0, ptr); \
902 st_func(vec_mergeh(c, d), 2, ptr); \
903 st_func(vec_mergel(a, b), 4, ptr); \
904 st_func(vec_mergel(c, d), 6, ptr); \
905} \
906VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
907{ \
908 Tvec m0 = ld_func(0, ptr); \
909 Tvec m1 = ld_func(2, ptr); \
910 a = vec_mergeh(m0, m1); \
911 b = vec_mergel(m0, m1); \
912} \
913VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
914 Tvec& c, Tvec& d) \
915{ \
916 Tvec v0 = ld_func(0, ptr); \
917 Tvec v1 = ld_func(2, ptr); \
918 Tvec v2 = ld_func(4, ptr); \
919 Tvec v3 = ld_func(6, ptr); \
920 a = vec_mergeh(v0, v2); \
921 b = vec_mergel(v0, v2); \
922 c = vec_mergeh(v1, v3); \
923 d = vec_mergel(v1, v3); \
924}
925VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2)
926VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
927VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st)
928
929/* 3 channels */
930#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec) \
931VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
932 const Tvec& c, Tp* ptr) \
933{ \
934 static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5}; \
935 static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15}; \
936 vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
937 static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26}; \
938 static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15}; \
939 vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr); \
940 static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0}; \
941 static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31}; \
942 vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr); \
943} \
944VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
945{ \
946 Tvec v1 = vsx_ld(0, ptr); \
947 Tvec v2 = vsx_ld(16, ptr); \
948 Tvec v3 = vsx_ld(32, ptr); \
949 static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0}; \
950 static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}; \
951 a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
952 static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
953 static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}; \
954 b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
955 static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0}; \
956 static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31}; \
957 c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
958}
959VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
960VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)
961
962#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec) \
963VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
964 const Tvec& c, Tp* ptr) \
965{ \
966 static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21}; \
967 static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15}; \
968 vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
969 static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11}; \
970 static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15}; \
971 vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr); \
972 static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0}; \
973 static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31}; \
974 vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr); \
975} \
976VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
977{ \
978 Tvec v1 = vsx_ld(0, ptr); \
979 Tvec v2 = vsx_ld(8, ptr); \
980 Tvec v3 = vsx_ld(16, ptr); \
981 static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
982 static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27}; \
983 a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
984 static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0}; \
985 static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29}; \
986 b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
987 static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
988 static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31}; \
989 c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
990}
991VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
992VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8)
993
994#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) \
995VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
996 const Tvec& c, Tp* ptr) \
997{ \
998 Tvec hbc = vec_mergeh(b, c); \
999 static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7}; \
1000 vsx_st(vec_perm(a, hbc, ahbc), 0, ptr); \
1001 Tvec lab = vec_mergel(a, b); \
1002 vsx_st(vec_sld(lab, hbc, 8), 4, ptr); \
1003 static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
1004 vsx_st(vec_perm(c, lab, clab), 8, ptr); \
1005} \
1006VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
1007{ \
1008 Tvec v1 = vsx_ld(0, ptr); \
1009 Tvec v2 = vsx_ld(4, ptr); \
1010 Tvec v3 = vsx_ld(8, ptr); \
1011 static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31}; \
1012 a = vec_perm(v1, vec_sld(v3, v2, 8), flp); \
1013 static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19}; \
1014 b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); \
1015 c = vec_perm(vec_sld(v2, v1, 8), v3, flp); \
1016}
1017VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4)
1018VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4)
1019VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
1020
1021#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) \
1022VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
1023 const Tvec& c, Tp* ptr) \
1024{ \
1025 st_func(vec_mergeh(a, b), 0, ptr); \
1026 st_func(vec_permi(c, a, 1), 2, ptr); \
1027 st_func(vec_mergel(b, c), 4, ptr); \
1028} \
1029VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, \
1030 Tvec& b, Tvec& c) \
1031{ \
1032 Tvec v1 = ld_func(0, ptr); \
1033 Tvec v2 = ld_func(2, ptr); \
1034 Tvec v3 = ld_func(4, ptr); \
1035 a = vec_permi(v1, v2, 1); \
1036 b = vec_permi(v1, v3, 2); \
1037 c = vec_permi(v2, v3, 1); \
1038}
1039VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2)
1040VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
1041VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st)
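
// Illustrative sketch, not part of the original header: splitting 16 packed 3-channel
// pixels (e.g. interleaved BGR bytes) into separate planes with the helpers above.
// The function name is hypothetical.
static inline void vsx_example_split_bgr(const uchar* pix, vec_uchar16& b, vec_uchar16& g, vec_uchar16& r)
{ vec_ld_deinterleave(pix, b, g, r); }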
1042
1043#endif // CV_VSX
1044
1046
1047#endif // OPENCV_HAL_VSX_UTILS_HPP