EstervQrCode 1.1.1
Library for QR code manipulation
msa_macros.h
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4 
5 #ifndef OPENCV_CORE_HAL_MSA_MACROS_H
6 #define OPENCV_CORE_HAL_MSA_MACROS_H
7 
8 #ifdef __mips_msa
9 #include "msa.h"
10 #include <stdint.h>
11 
12 #ifdef __cplusplus
13 extern "C" {
14 #endif
15 
16 /* Define 64-bit vector types */
17 typedef signed char v8i8 __attribute__ ((vector_size(8), aligned(8)));
18 typedef unsigned char v8u8 __attribute__ ((vector_size(8), aligned(8)));
19 typedef short v4i16 __attribute__ ((vector_size(8), aligned(8)));
20 typedef unsigned short v4u16 __attribute__ ((vector_size(8), aligned(8)));
21 typedef int v2i32 __attribute__ ((vector_size(8), aligned(8)));
22 typedef unsigned int v2u32 __attribute__ ((vector_size(8), aligned(8)));
23 typedef long long v1i64 __attribute__ ((vector_size(8), aligned(8)));
24 typedef unsigned long long v1u64 __attribute__ ((vector_size(8), aligned(8)));
25 typedef float v2f32 __attribute__ ((vector_size(8), aligned(8)));
26 typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
27 
28 
29 /* Load values from the given memory address to a 64-bit vector. */
30 #define msa_ld1_s8(__a) (*((v8i8*)(__a)))
31 #define msa_ld1_s16(__a) (*((v4i16*)(__a)))
32 #define msa_ld1_s32(__a) (*((v2i32*)(__a)))
33 #define msa_ld1_s64(__a) (*((v1i64*)(__a)))
34 #define msa_ld1_u8(__a) (*((v8u8*)(__a)))
35 #define msa_ld1_u16(__a) (*((v4u16*)(__a)))
36 #define msa_ld1_u32(__a) (*((v2u32*)(__a)))
37 #define msa_ld1_u64(__a) (*((v1u64*)(__a)))
38 #define msa_ld1_f32(__a) (*((v2f32*)(__a)))
39 #define msa_ld1_f64(__a) (*((v1f64*)(__a)))
40 
41 /* Load values from the given memory address to a 128-bit vector */
42 #define msa_ld1q_s8(__a) ((v16i8)__builtin_msa_ld_b(__a, 0))
43 #define msa_ld1q_s16(__a) ((v8i16)__builtin_msa_ld_h(__a, 0))
44 #define msa_ld1q_s32(__a) ((v4i32)__builtin_msa_ld_w(__a, 0))
45 #define msa_ld1q_s64(__a) ((v2i64)__builtin_msa_ld_d(__a, 0))
46 #define msa_ld1q_u8(__a) ((v16u8)__builtin_msa_ld_b(__a, 0))
47 #define msa_ld1q_u16(__a) ((v8u16)__builtin_msa_ld_h(__a, 0))
48 #define msa_ld1q_u32(__a) ((v4u32)__builtin_msa_ld_w(__a, 0))
49 #define msa_ld1q_u64(__a) ((v2u64)__builtin_msa_ld_d(__a, 0))
50 #define msa_ld1q_f32(__a) ((v4f32)__builtin_msa_ld_w(__a, 0))
51 #define msa_ld1q_f64(__a) ((v2f64)__builtin_msa_ld_d(__a, 0))
52 
53 /* Store 64-bit vector element values to the given memory address. */
54 #define msa_st1_s8(__a, __b) (*((v8i8*)(__a)) = __b)
55 #define msa_st1_s16(__a, __b) (*((v4i16*)(__a)) = __b)
56 #define msa_st1_s32(__a, __b) (*((v2i32*)(__a)) = __b)
57 #define msa_st1_s64(__a, __b) (*((v1i64*)(__a)) = __b)
58 #define msa_st1_u8(__a, __b) (*((v8u8*)(__a)) = __b)
59 #define msa_st1_u16(__a, __b) (*((v4u16*)(__a)) = __b)
60 #define msa_st1_u32(__a, __b) (*((v2u32*)(__a)) = __b)
61 #define msa_st1_u64(__a, __b) (*((v1u64*)(__a)) = __b)
62 #define msa_st1_f32(__a, __b) (*((v2f32*)(__a)) = __b)
63 #define msa_st1_f64(__a, __b) (*((v1f64*)(__a)) = __b)
64 
65 /* Store the values of elements in the 128-bit vector __b to the given memory address __a. */
66 #define msa_st1q_s8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0))
67 #define msa_st1q_s16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
68 #define msa_st1q_s32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
69 #define msa_st1q_s64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
70 #define msa_st1q_u8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0))
71 #define msa_st1q_u16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
72 #define msa_st1q_u32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
73 #define msa_st1q_u64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
74 #define msa_st1q_f32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
75 #define msa_st1q_f64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
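
/* Usage sketch (illustrative addition, not part of the original OpenCV header):
   round-trip four int32 lanes through a 128-bit vector with the load/store
   macros above. Function and parameter names are placeholders. */
static inline void msa_example_copy4_s32(int32_t* src, int32_t* dst)
{
    v4i32 v = msa_ld1q_s32(src);  /* load four int32 lanes from src   */
    msa_st1q_s32(dst, v);         /* write the same four lanes to dst */
}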
76 
77 /* Store the value of the element with index __c in vector __b to the given memory address __a. */
78 #define msa_st1_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = __b[__c])
79 #define msa_st1_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = __b[__c])
80 #define msa_st1_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __b[__c])
81 #define msa_st1_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __b[__c])
82 #define msa_st1_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = __b[__c])
83 #define msa_st1_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = __b[__c])
84 #define msa_st1_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __b[__c])
85 #define msa_st1_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __b[__c])
86 #define msa_st1_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c])
87 #define msa_st1_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c])
88 #define msa_st1q_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = (int8_t)__builtin_msa_copy_s_b(__b, __c))
89 #define msa_st1q_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = (int16_t)__builtin_msa_copy_s_h(__b, __c))
90 #define msa_st1q_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __builtin_msa_copy_s_w(__b, __c))
91 #define msa_st1q_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __builtin_msa_copy_s_d(__b, __c))
92 #define msa_st1q_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = (uint8_t)__builtin_msa_copy_u_b((v16i8)(__b), __c))
93 #define msa_st1q_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = (uint16_t)__builtin_msa_copy_u_h((v8i16)(__b), __c))
94 #define msa_st1q_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __builtin_msa_copy_u_w((v4i32)(__b), __c))
95 #define msa_st1q_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __builtin_msa_copy_u_d((v2i64)(__b), __c))
96 #define msa_st1q_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c])
97 #define msa_st1q_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c])
98 
99 /* Duplicate elements for 64-bit doubleword vectors */
100 #define msa_dup_n_s8(__a) ((v8i8)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
101 #define msa_dup_n_s16(__a) ((v4i16)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
102 #define msa_dup_n_s32(__a) ((v2i32){__a, __a})
103 #define msa_dup_n_s64(__a) ((v1i64){__a})
104 #define msa_dup_n_u8(__a) ((v8u8)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
105 #define msa_dup_n_u16(__a) ((v4u16)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
106 #define msa_dup_n_u32(__a) ((v2u32){__a, __a})
107 #define msa_dup_n_u64(__a) ((v1u64){__a})
108 #define msa_dup_n_f32(__a) ((v2f32){__a, __a})
109 #define msa_dup_n_f64(__a) ((v1f64){__a})
110 
111 /* Duplicate elements for 128-bit quadword vectors */
112 #define msa_dupq_n_s8(__a) (__builtin_msa_fill_b((int32_t)(__a)))
113 #define msa_dupq_n_s16(__a) (__builtin_msa_fill_h((int32_t)(__a)))
114 #define msa_dupq_n_s32(__a) (__builtin_msa_fill_w((int32_t)(__a)))
115 #define msa_dupq_n_s64(__a) (__builtin_msa_fill_d((int64_t)(__a)))
116 #define msa_dupq_n_u8(__a) ((v16u8)__builtin_msa_fill_b((int32_t)(__a)))
117 #define msa_dupq_n_u16(__a) ((v8u16)__builtin_msa_fill_h((int32_t)(__a)))
118 #define msa_dupq_n_u32(__a) ((v4u32)__builtin_msa_fill_w((int32_t)(__a)))
119 #define msa_dupq_n_u64(__a) ((v2u64)__builtin_msa_fill_d((int64_t)(__a)))
120 #define msa_dupq_n_f32(__a) ((v4f32){__a, __a, __a, __a})
121 #define msa_dupq_n_f64(__a) ((v2f64){__a, __a})
122 #define msa_dupq_lane_s8(__a, __b) (__builtin_msa_splat_b(__a, __b))
123 #define msa_dupq_lane_s16(__a, __b) (__builtin_msa_splat_h(__a, __b))
124 #define msa_dupq_lane_s32(__a, __b) (__builtin_msa_splat_w(__a, __b))
125 #define msa_dupq_lane_s64(__a, __b) (__builtin_msa_splat_d(__a, __b))
126 #define msa_dupq_lane_u8(__a, __b) ((v16u8)__builtin_msa_splat_b((v16i8)(__a), __b))
127 #define msa_dupq_lane_u16(__a, __b) ((v8u16)__builtin_msa_splat_h((v8i16)(__a), __b))
128 #define msa_dupq_lane_u32(__a, __b) ((v4u32)__builtin_msa_splat_w((v4i32)(__a), __b))
129 #define msa_dupq_lane_u64(__a, __b) ((v2u64)__builtin_msa_splat_d((v2i64)(__a), __b))
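
/* Usage sketch (illustrative addition, not part of the original header):
   broadcast one scalar into every lane of a 128-bit vector with the dup
   macros above. */
static inline v8i16 msa_example_splat_s16(int16_t value)
{
    return msa_dupq_n_s16(value);  /* all eight 16-bit lanes hold 'value' */
}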
130 
131 /* Create a 64-bit vector */
132 #define msa_create_s8(__a) ((v8i8)((uint64_t)(__a)))
133 #define msa_create_s16(__a) ((v4i16)((uint64_t)(__a)))
134 #define msa_create_s32(__a) ((v2i32)((uint64_t)(__a)))
135 #define msa_create_s64(__a) ((v1i64)((uint64_t)(__a)))
136 #define msa_create_u8(__a) ((v8u8)((uint64_t)(__a)))
137 #define msa_create_u16(__a) ((v4u16)((uint64_t)(__a)))
138 #define msa_create_u32(__a) ((v2u32)((uint64_t)(__a)))
139 #define msa_create_u64(__a) ((v1u64)((uint64_t)(__a)))
140 #define msa_create_f32(__a) ((v2f32)((uint64_t)(__a)))
141 #define msa_create_f64(__a) ((v1f64)((uint64_t)(__a)))
142 
143 /* Sign-extend or zero-extend each element in a 64-bit vector to twice its original length, and place the results in a 128-bit vector. */
144 /* Transform v8i8 to v8i16 */
145 #define msa_movl_s8(__a) \
146 ((v8i16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
147  (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
148 
149 /* Transform v8u8 to v8u16 */
150 #define msa_movl_u8(__a) \
151 ((v8u16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
152  (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
153 
154 /* Transform v4i16 to v4i32 */
155 #define msa_movl_s16(__a) ((v4i32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
156 
157 /* Transform v2i32 to v2i64 */
158 #define msa_movl_s32(__a) ((v2i64){(__a)[0], (__a)[1]})
159 
160 /* Transform v4u16 to v4u32 */
161 #define msa_movl_u16(__a) ((v4u32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
162 
163 /* Transform v2u32 to v2u64 */
164 #define msa_movl_u32(__a) ((v2u64){(__a)[0], (__a)[1]})
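
/* Usage sketch (illustrative addition, not part of the original header):
   widen the eight u8 lanes of a 64-bit vector into eight u16 lanes of a
   128-bit vector. */
static inline v8u16 msa_example_widen_u8(v8u8 a)
{
    return msa_movl_u8(a);  /* zero-extend each byte to 16 bits */
}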
165 
166 /* Copies the least significant half of each element of a 128-bit vector into the corresponding elements of a 64-bit vector. */
167 #define msa_movn_s16(__a) \
168 ({ \
169  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
170  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
171 })
172 
173 #define msa_movn_s32(__a) \
174 ({ \
175  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
176  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
177 })
178 
179 #define msa_movn_s64(__a) \
180 ({ \
181  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
182  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
183 })
184 
185 #define msa_movn_u16(__a) \
186 ({ \
187  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
188  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
189 })
190 
191 #define msa_movn_u32(__a) \
192 ({ \
193  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
194  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
195 })
196 
197 #define msa_movn_u64(__a) \
198 ({ \
199  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
200  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
201 })
202 
203 /* qmovn */
204 #define msa_qmovn_s16(__a) \
205 ({ \
206  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)); \
207  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
208 })
209 
210 #define msa_qmovn_s32(__a) \
211 ({ \
212  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)); \
213  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
214 })
215 
216 #define msa_qmovn_s64(__a) \
217 ({ \
218  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)); \
219  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
220 })
221 
222 #define msa_qmovn_u16(__a) \
223 ({ \
224  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)); \
225  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
226 })
227 
228 #define msa_qmovn_u32(__a) \
229 ({ \
230  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)); \
231  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
232 })
233 
234 #define msa_qmovn_u64(__a) \
235 ({ \
236  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)); \
237  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
238 })
239 
240 /* qmovun */
241 #define msa_qmovun_s16(__a) \
242 ({ \
243  v8i16 __d = __builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
244  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
245  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
246 })
247 
248 #define msa_qmovun_s32(__a) \
249 ({ \
250  v4i32 __d = __builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
251  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
252  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
253 })
254 
255 #define msa_qmovun_s64(__a) \
256 ({ \
257  v2i64 __d = __builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)); \
258  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
259  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
260 })
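
/* Usage sketch (illustrative addition, not part of the original header):
   narrow signed 16-bit lanes to unsigned 8-bit lanes with saturation
   (negative values clamp to 0, values above 255 clamp to 255). */
static inline v8u8 msa_example_narrow_s16_to_u8(v8i16 a)
{
    return msa_qmovun_s16(a);
}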
261 
262 /* Right-shift elements in a 128-bit vector by an immediate value, and place the results in a 64-bit vector. */
263 #define msa_shrn_n_s16(__a, __b) \
264 ({ \
265  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__b))); \
266  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
267 })
268 
269 #define msa_shrn_n_s32(__a, __b) \
270 ({ \
271  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__b))); \
272  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
273 })
274 
275 #define msa_shrn_n_s64(__a, __b) \
276 ({ \
277  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__b))); \
278  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
279 })
280 
281 #define msa_shrn_n_u16(__a, __b) \
282 ({ \
283  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__b))); \
284  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
285 })
286 
287 #define msa_shrn_n_u32(__a, __b) \
288 ({ \
289  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__b))); \
290  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
291 })
292 
293 #define msa_shrn_n_u64(__a, __b) \
294 ({ \
295  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__b))); \
296  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
297 })
298 
299 /* Rounded right-shift elements in a 128-bit vector by an immediate value, and place the results in a 64-bit vector. */
300 #define msa_rshrn_n_s16(__a, __b) \
301 ({ \
302  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)__b)); \
303  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
304 })
305 
306 #define msa_rshrn_n_s32(__a, __b) \
307 ({ \
308  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)__b)); \
309  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
310 })
311 
312 #define msa_rshrn_n_s64(__a, __b) \
313 ({ \
314  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)__b)); \
315  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
316 })
317 
318 #define msa_rshrn_n_u16(__a, __b) \
319 ({ \
320  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)__b)); \
321  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
322 })
323 
324 #define msa_rshrn_n_u32(__a, __b) \
325 ({ \
326  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)__b)); \
327  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
328 })
329 
330 #define msa_rshrn_n_u64(__a, __b) \
331 ({ \
332  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)__b)); \
333  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
334 })
335 
336 /* Rounded right-shift elements in a 128-bit vector by an immediate value, saturate the results, and place them in a 64-bit vector. */
337 #define msa_qrshrn_n_s16(__a, __b) \
338 ({ \
339  v8i16 __d = __builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__b)), 7); \
340  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
341  (v8i8)__builtin_msa_copy_s_d((v2i64)__e, 0); \
342 })
343 
344 #define msa_qrshrn_n_s32(__a, __b) \
345 ({ \
346  v4i32 __d = __builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__b)), 15); \
347  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
348  (v4i16)__builtin_msa_copy_s_d((v2i64)__e, 0); \
349 })
350 
351 #define msa_qrshrn_n_s64(__a, __b) \
352 ({ \
353  v2i64 __d = __builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__b)), 31); \
354  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
355  (v2i32)__builtin_msa_copy_s_d((v2i64)__e, 0); \
356 })
357 
358 #define msa_qrshrn_n_u16(__a, __b) \
359 ({ \
360  v8u16 __d = __builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b)), 7); \
361  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \
362  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
363 })
364 
365 #define msa_qrshrn_n_u32(__a, __b) \
366 ({ \
367  v4u32 __d = __builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b)), 15); \
368  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \
369  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
370 })
371 
372 #define msa_qrshrn_n_u64(__a, __b) \
373 ({ \
374  v2u64 __d = __builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b)), 31); \
375  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \
376  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
377 })
378 
379 /* Rounded right-shift elements in a 128-bit vector by an immediate value, saturate the results, and place them in a 64-bit vector.
380  Input is signed and output is unsigned. */
381 #define msa_qrshrun_n_s16(__a, __b) \
382 ({ \
383  v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__b)); \
384  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
385  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
386 })
387 
388 #define msa_qrshrun_n_s32(__a, __b) \
389 ({ \
390  v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__b)); \
391  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
392  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
393 })
394 
395 #define msa_qrshrun_n_s64(__a, __b) \
396 ({ \
397  v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__b)); \
398  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
399  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
400 })
401 
402 /* pack */
403 #define msa_pack_s16(__a, __b) (__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
404 #define msa_pack_s32(__a, __b) (__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
405 #define msa_pack_s64(__a, __b) (__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))
406 #define msa_pack_u16(__a, __b) ((v16u8)__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
407 #define msa_pack_u32(__a, __b) ((v8u16)__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
408 #define msa_pack_u64(__a, __b) ((v4u32)__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))
409 
410 /* qpack */
411 #define msa_qpack_s16(__a, __b) \
412 (__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h((v8i16)(__b), 7), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)))
413 #define msa_qpack_s32(__a, __b) \
414 (__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w((v4i32)(__b), 15), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)))
415 #define msa_qpack_s64(__a, __b) \
416 (__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d((v2i64)(__b), 31), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)))
417 #define msa_qpack_u16(__a, __b) \
418 ((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__b), 7), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)))
419 #define msa_qpack_u32(__a, __b) \
420 ((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__b), 15), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)))
421 #define msa_qpack_u64(__a, __b) \
422 ((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__b), 31), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)))
423 
424 /* qpacku */
425 #define msa_qpacku_s16(__a, __b) \
426 ((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b))), 7), \
427  (v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a))), 7)))
428 #define msa_qpacku_s32(__a, __b) \
429 ((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b))), 15), \
430  (v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a))), 15)))
431 #define msa_qpacku_s64(__a, __b) \
432 ((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b))), 31), \
433  (v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a))), 31)))
434 
435 /* packr */
436 #define msa_packr_s16(__a, __b, __c) \
437 (__builtin_msa_pckev_b((v16i8)__builtin_msa_srai_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__c))))
438 #define msa_packr_s32(__a, __b, __c) \
439 (__builtin_msa_pckev_h((v8i16)__builtin_msa_srai_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__c))))
440 #define msa_packr_s64(__a, __b, __c) \
441 (__builtin_msa_pckev_w((v4i32)__builtin_msa_srai_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__c))))
442 #define msa_packr_u16(__a, __b, __c) \
443 ((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srli_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__c))))
444 #define msa_packr_u32(__a, __b, __c) \
445 ((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srli_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__c))))
446 #define msa_packr_u64(__a, __b, __c) \
447 ((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srli_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__c))))
448 
449 /* rpackr */
450 #define msa_rpackr_s16(__a, __b, __c) \
451 (__builtin_msa_pckev_b((v16i8)__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__c))))
452 #define msa_rpackr_s32(__a, __b, __c) \
453 (__builtin_msa_pckev_h((v8i16)__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__c))))
454 #define msa_rpackr_s64(__a, __b, __c) \
455 (__builtin_msa_pckev_w((v4i32)__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__c))))
456 #define msa_rpackr_u16(__a, __b, __c) \
457 ((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c))))
458 #define msa_rpackr_u32(__a, __b, __c) \
459 ((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c))))
460 #define msa_rpackr_u64(__a, __b, __c) \
461 ((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c))))
462 
463 /* qrpackr */
464 #define msa_qrpackr_s16(__a, __b, __c) \
465 (__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), 7), \
466  (v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__c)), 7)))
467 #define msa_qrpackr_s32(__a, __b, __c) \
468 (__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), 15), \
469  (v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__c)), 15)))
470 #define msa_qrpackr_s64(__a, __b, __c) \
471 (__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), 31), \
472  (v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__c)), 31)))
473 #define msa_qrpackr_u16(__a, __b, __c) \
474 ((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), 7), \
475  (v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)), 7)))
476 #define msa_qrpackr_u32(__a, __b, __c) \
477 ((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), 15), \
478  (v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)), 15)))
479 #define msa_qrpackr_u64(__a, __b, __c) \
480 ((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), 31), \
481  (v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)), 31)))
482 
483 /* qrpackru */
484 #define msa_qrpackru_s16(__a, __b, __c) \
485 ({ \
486  v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__c)); \
487  v8i16 __e = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b)), (int)(__c)); \
488  (v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__e, 7), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
489 })
490 
491 #define msa_qrpackru_s32(__a, __b, __c) \
492 ({ \
493  v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__c)); \
494  v4i32 __e = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b)), (int)(__c)); \
495  (v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__e, 15), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
496 })
497 
498 #define msa_qrpackru_s64(__a, __b, __c) \
499 ({ \
500  v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__c)); \
501  v2i64 __e = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b)), (int)(__c)); \
502  (v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__e, 31), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
503 })
504 
505 /* Minimum values between corresponding elements in the two vectors are written to the returned vector. */
506 #define msa_minq_s8(__a, __b) (__builtin_msa_min_s_b(__a, __b))
507 #define msa_minq_s16(__a, __b) (__builtin_msa_min_s_h(__a, __b))
508 #define msa_minq_s32(__a, __b) (__builtin_msa_min_s_w(__a, __b))
509 #define msa_minq_s64(__a, __b) (__builtin_msa_min_s_d(__a, __b))
510 #define msa_minq_u8(__a, __b) ((v16u8)__builtin_msa_min_u_b(__a, __b))
511 #define msa_minq_u16(__a, __b) ((v8u16)__builtin_msa_min_u_h(__a, __b))
512 #define msa_minq_u32(__a, __b) ((v4u32)__builtin_msa_min_u_w(__a, __b))
513 #define msa_minq_u64(__a, __b) ((v2u64)__builtin_msa_min_u_d(__a, __b))
514 #define msa_minq_f32(__a, __b) (__builtin_msa_fmin_w(__a, __b))
515 #define msa_minq_f64(__a, __b) (__builtin_msa_fmin_d(__a, __b))
516 
517 /* Maximum values between corresponding elements in the two vectors are written to the returned vector. */
518 #define msa_maxq_s8(__a, __b) (__builtin_msa_max_s_b(__a, __b))
519 #define msa_maxq_s16(__a, __b) (__builtin_msa_max_s_h(__a, __b))
520 #define msa_maxq_s32(__a, __b) (__builtin_msa_max_s_w(__a, __b))
521 #define msa_maxq_s64(__a, __b) (__builtin_msa_max_s_d(__a, __b))
522 #define msa_maxq_u8(__a, __b) ((v16u8)__builtin_msa_max_u_b(__a, __b))
523 #define msa_maxq_u16(__a, __b) ((v8u16)__builtin_msa_max_u_h(__a, __b))
524 #define msa_maxq_u32(__a, __b) ((v4u32)__builtin_msa_max_u_w(__a, __b))
525 #define msa_maxq_u64(__a, __b) ((v2u64)__builtin_msa_max_u_d(__a, __b))
526 #define msa_maxq_f32(__a, __b) (__builtin_msa_fmax_w(__a, __b))
527 #define msa_maxq_f64(__a, __b) (__builtin_msa_fmax_d(__a, __b))
528 
529 /* Vector type reinterpretation */
530 #define MSA_TPV_REINTERPRET(_Tpv, Vec) ((_Tpv)(Vec))
531 
532 /* Add the odd elements in vector __a to the even elements in vector __b, producing double-width elements in the returned vector. */
533 /* v8i16 msa_hadd_s16 ((v16i8)__a, (v16i8)__b) */
534 #define msa_hadd_s16(__a, __b) (__builtin_msa_hadd_s_h((v16i8)(__a), (v16i8)(__b)))
535 /* v4i32 msa_hadd_s32 ((v8i16)__a, (v8i16)__b) */
536 #define msa_hadd_s32(__a, __b) (__builtin_msa_hadd_s_w((v8i16)(__a), (v8i16)(__b)))
537 /* v2i64 msa_hadd_s64 ((v4i32)__a, (v4i32)__b) */
538 #define msa_hadd_s64(__a, __b) (__builtin_msa_hadd_s_d((v4i32)(__a), (v4i32)(__b)))
539 
540 /* Copy even elements in __a to the left half and even elements in __b to the right half and return the result vector. */
541 #define msa_pckev_s8(__a, __b) (__builtin_msa_pckev_b((v16i8)(__a), (v16i8)(__b)))
542 #define msa_pckev_s16(__a, __b) (__builtin_msa_pckev_h((v8i16)(__a), (v8i16)(__b)))
543 #define msa_pckev_s32(__a, __b) (__builtin_msa_pckev_w((v4i32)(__a), (v4i32)(__b)))
544 #define msa_pckev_s64(__a, __b) (__builtin_msa_pckev_d((v2i64)(__a), (v2i64)(__b)))
545 
546 /* Copy odd elements in __a to the left half and odd elements in __b to the right half and return the result vector. */
547 #define msa_pckod_s8(__a, __b) (__builtin_msa_pckod_b((v16i8)(__a), (v16i8)(__b)))
548 #define msa_pckod_s16(__a, __b) (__builtin_msa_pckod_h((v8i16)(__a), (v8i16)(__b)))
549 #define msa_pckod_s32(__a, __b) (__builtin_msa_pckod_w((v4i32)(__a), (v4i32)(__b)))
550 #define msa_pckod_s64(__a, __b) (__builtin_msa_pckod_d((v2i64)(__a), (v2i64)(__b)))
551 
552 #ifdef _MIPSEB
553 #define LANE_IMM0_1(x) (0b1 - ((x) & 0b1))
554 #define LANE_IMM0_3(x) (0b11 - ((x) & 0b11))
555 #define LANE_IMM0_7(x) (0b111 - ((x) & 0b111))
556 #define LANE_IMM0_15(x) (0b1111 - ((x) & 0b1111))
557 #else
558 #define LANE_IMM0_1(x) ((x) & 0b1)
559 #define LANE_IMM0_3(x) ((x) & 0b11)
560 #define LANE_IMM0_7(x) ((x) & 0b111)
561 #define LANE_IMM0_15(x) ((x) & 0b1111)
562 #endif
563 
564 #define msa_get_lane_u8(__a, __b) ((uint8_t)(__a)[LANE_IMM0_7(__b)])
565 #define msa_get_lane_s8(__a, __b) ((int8_t)(__a)[LANE_IMM0_7(__b)])
566 #define msa_get_lane_u16(__a, __b) ((uint16_t)(__a)[LANE_IMM0_3(__b)])
567 #define msa_get_lane_s16(__a, __b) ((int16_t)(__a)[LANE_IMM0_3(__b)])
568 #define msa_get_lane_u32(__a, __b) ((uint32_t)(__a)[LANE_IMM0_1(__b)])
569 #define msa_get_lane_s32(__a, __b) ((int32_t)(__a)[LANE_IMM0_1(__b)])
570 #define msa_get_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)])
571 #define msa_get_lane_s64(__a, __b) ((int64_t)(__a)[LANE_IMM0_1(__b)])
572 #define msa_get_lane_u64(__a, __b) ((uint64_t)(__a)[LANE_IMM0_1(__b)])
573 #define msa_get_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)])
574 #define msa_getq_lane_u8(__a, imm0_15) ((uint8_t)__builtin_msa_copy_u_b((v16i8)(__a), imm0_15))
575 #define msa_getq_lane_s8(__a, imm0_15) ((int8_t)__builtin_msa_copy_s_b(__a, imm0_15))
576 #define msa_getq_lane_u16(__a, imm0_7) ((uint16_t)__builtin_msa_copy_u_h((v8i16)(__a), imm0_7))
577 #define msa_getq_lane_s16(__a, imm0_7) ((int16_t)__builtin_msa_copy_s_h(__a, imm0_7))
578 #define msa_getq_lane_u32(__a, imm0_3) __builtin_msa_copy_u_w((v4i32)(__a), imm0_3)
579 #define msa_getq_lane_s32 __builtin_msa_copy_s_w
580 #define msa_getq_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)])
581 #define msa_getq_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)])
582 #if (__mips == 64)
583 #define msa_getq_lane_u64(__a, imm0_1) __builtin_msa_copy_u_d((v2i64)(__a), imm0_1)
584 #define msa_getq_lane_s64 __builtin_msa_copy_s_d
585 #else
586 #define msa_getq_lane_u64(__a, imm0_1) ((uint64_t)(__a)[LANE_IMM0_1(imm0_1)])
587 #define msa_getq_lane_s64(__a, imm0_1) ((int64_t)(__a)[LANE_IMM0_1(imm0_1)])
588 #endif
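
/* Usage sketch (illustrative addition, not part of the original header):
   read a single lane back out of a 128-bit vector. The lane index must be a
   compile-time constant for the copy builtins used above. */
static inline int32_t msa_example_third_lane(v4i32 a)
{
    return msa_getq_lane_s32(a, 2);  /* lanes are numbered 0..3 */
}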
589 
590 /* combine */
591 #if (__mips == 64)
592 #define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v2u64){((v1u64)(a))[0], ((v1u64)(b))[0]}))
593 #else
594 #define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v4u32){((v2u32)(a))[0], ((v2u32)(a))[1], \
595  ((v2u32)(b))[0], ((v2u32)(b))[1]}))
596 #endif
597 
598 /* v16i8 msa_combine_s8 (v8i8 __a, v8i8 __b) */
599 #define msa_combine_s8(__a, __b) __COMBINE_64_64(v16i8, __a, __b)
600 
601 /* v8i16 msa_combine_s16(v4i16 __a, v4i16 __b) */
602 #define msa_combine_s16(__a, __b) __COMBINE_64_64(v8i16, __a, __b)
603 
604 /* v4i32 msa_combine_s32(v2i32 __a, v2i32 __b) */
605 #define msa_combine_s32(__a, __b) __COMBINE_64_64(v4i32, __a, __b)
606 
607 /* v2i64 msa_combine_s64(v1i64 __a, v1i64 __b) */
608 #define msa_combine_s64(__a, __b) __COMBINE_64_64(v2i64, __a, __b)
609 
610 /* v4f32 msa_combine_f32(v2f32 __a, v2f32 __b) */
611 #define msa_combine_f32(__a, __b) __COMBINE_64_64(v4f32, __a, __b)
612 
613 /* v16u8 msa_combine_u8(v8u8 __a, v8u8 __b) */
614 #define msa_combine_u8(__a, __b) __COMBINE_64_64(v16u8, __a, __b)
615 
616 /* v8u16 msa_combine_u16(v4u16 __a, v4u16 __b) */
617 #define msa_combine_u16(__a, __b) __COMBINE_64_64(v8u16, __a, __b)
618 
619 /* v4u32 msa_combine_u32(v2u32 __a, v2u32 __b) */
620 #define msa_combine_u32(__a, __b) __COMBINE_64_64(v4u32, __a, __b)
621 
622 /* v2u64 msa_combine_u64(v1u64 __a, v1u64 __b) */
623 #define msa_combine_u64(__a, __b) __COMBINE_64_64(v2u64, __a, __b)
624 
625 /* v2f64 msa_combine_f64(v1f64 __a, v1f64 __b) */
626 #define msa_combine_f64(__a, __b) __COMBINE_64_64(v2f64, __a, __b)
627 
628 /* get_low, get_high */
629 #if (__mips == 64)
630 #define __GET_LOW(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 0))))
631 #define __GET_HIGH(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 1))))
632 #else
633 #define __GET_LOW(__TYPE, a) ((__TYPE)(((v2u64)(a))[0]))
634 #define __GET_HIGH(__TYPE, a) ((__TYPE)(((v2u64)(a))[1]))
635 #endif
636 
637 /* v8i8 msa_get_low_s8(v16i8 __a) */
638 #define msa_get_low_s8(__a) __GET_LOW(v8i8, __a)
639 
640 /* v4i16 msa_get_low_s16(v8i16 __a) */
641 #define msa_get_low_s16(__a) __GET_LOW(v4i16, __a)
642 
643 /* v2i32 msa_get_low_s32(v4i32 __a) */
644 #define msa_get_low_s32(__a) __GET_LOW(v2i32, __a)
645 
646 /* v1i64 msa_get_low_s64(v2i64 __a) */
647 #define msa_get_low_s64(__a) __GET_LOW(v1i64, __a)
648 
649 /* v8u8 msa_get_low_u8(v16u8 __a) */
650 #define msa_get_low_u8(__a) __GET_LOW(v8u8, __a)
651 
652 /* v4u16 msa_get_low_u16(v8u16 __a) */
653 #define msa_get_low_u16(__a) __GET_LOW(v4u16, __a)
654 
655 /* v2u32 msa_get_low_u32(v4u32 __a) */
656 #define msa_get_low_u32(__a) __GET_LOW(v2u32, __a)
657 
658 /* v1u64 msa_get_low_u64(v2u64 __a) */
659 #define msa_get_low_u64(__a) __GET_LOW(v1u64, __a)
660 
661 /* v2f32 msa_get_low_f32(v4f32 __a) */
662 #define msa_get_low_f32(__a) __GET_LOW(v2f32, __a)
663 
664 /* v1f64 msa_get_low_f64(v2f64 __a) */
665 #define msa_get_low_f64(__a) __GET_LOW(v1f64, __a)
666 
667 /* v8i8 msa_get_high_s8(v16i8 __a) */
668 #define msa_get_high_s8(__a) __GET_HIGH(v8i8, __a)
669 
670 /* v4i16 msa_get_high_s16(v8i16 __a) */
671 #define msa_get_high_s16(__a) __GET_HIGH(v4i16, __a)
672 
673 /* v2i32 msa_get_high_s32(v4i32 __a) */
674 #define msa_get_high_s32(__a) __GET_HIGH(v2i32, __a)
675 
676 /* v1i64 msa_get_high_s64(v2i64 __a) */
677 #define msa_get_high_s64(__a) __GET_HIGH(v1i64, __a)
678 
679 /* v8u8 msa_get_high_u8(v16u8 __a) */
680 #define msa_get_high_u8(__a) __GET_HIGH(v8u8, __a)
681 
682 /* v4u16 msa_get_high_u16(v8u16 __a) */
683 #define msa_get_high_u16(__a) __GET_HIGH(v4u16, __a)
684 
685 /* v2u32 msa_get_high_u32(v4u32 __a) */
686 #define msa_get_high_u32(__a) __GET_HIGH(v2u32, __a)
687 
688 /* v1u64 msa_get_high_u64(v2u64 __a) */
689 #define msa_get_high_u64(__a) __GET_HIGH(v1u64, __a)
690 
691 /* v2f32 msa_get_high_f32(v4f32 __a) */
692 #define msa_get_high_f32(__a) __GET_HIGH(v2f32, __a)
693 
694 /* v1f64 msa_get_high_f64(v2f64 __a) */
695 #define msa_get_high_f64(__a) __GET_HIGH(v1f64, __a)
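
/* Usage sketch (illustrative addition, not part of the original header):
   split a 128-bit vector into its 64-bit halves and stitch them back together
   with the combine macros above. */
static inline v16u8 msa_example_split_and_join_u8(v16u8 a)
{
    v8u8 lo = msa_get_low_u8(a);   /* lanes 0..7  */
    v8u8 hi = msa_get_high_u8(a);  /* lanes 8..15 */
    return msa_combine_u8(lo, hi); /* equal to the original vector */
}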
696 
697 /* ri = ai * b[lane] */
698 /* v4f32 msa_mulq_lane_f32(v4f32 __a, v4f32 __b, const int __lane) */
699 #define msa_mulq_lane_f32(__a, __b, __lane) ((__a) * msa_getq_lane_f32(__b, __lane))
700 
701 /* ri = ai + bi * c[lane] */
702 /* v4f32 msa_mlaq_lane_f32(v4f32 __a, v4f32 __b, v4f32 __c, const int __lane) */
703 #define msa_mlaq_lane_f32(__a, __b, __c, __lane) ((__a) + ((__b) * msa_getq_lane_f32(__c, __lane)))
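
/* Usage sketch (illustrative addition, not part of the original header):
   accumulate x scaled by one coefficient lane, the typical inner step of a
   small matrix-vector multiply. */
static inline v4f32 msa_example_axpy_lane(v4f32 acc, v4f32 x, v4f32 coeffs)
{
    return msa_mlaq_lane_f32(acc, x, coeffs, 0);  /* acc + x * coeffs[0] */
}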
704 
705 /* uint16_t msa_sum_u16(v8u16 __a)*/
706 #define msa_sum_u16(__a) \
707 ({ \
708  v4u32 _b; \
709  v2u64 _c; \
710  _b = __builtin_msa_hadd_u_w(__a, __a); \
711  _c = __builtin_msa_hadd_u_d(_b, _b); \
712  (uint16_t)(_c[0] + _c[1]); \
713 })
714 
715 /* int16_t msa_sum_s16(v8i16 __a) */
716 #define msa_sum_s16(__a) \
717 ({ \
718  v4i32 _b; \
719  v2i64 _c; \
720  _b = __builtin_msa_hadd_s_w(__a, __a); \
721  _c = __builtin_msa_hadd_s_d(_b, _b); \
722  (int32_t)(_c[0] + _c[1]); \
723 })
724 
725 
726 /* uint32_t msa_sum_u32(v4u32 __a)*/
727 #define msa_sum_u32(__a) \
728 ({ \
729  v2u64 _b; \
730  _b = __builtin_msa_hadd_u_d(__a, __a); \
731  (uint32_t)(_b[0] + _b[1]); \
732 })
733 
734 /* int32_t msa_sum_s32(v4i32 __a)*/
735 #define msa_sum_s32(__a) \
736 ({ \
737  v2i64 _b; \
738  _b = __builtin_msa_hadd_s_d(__a, __a); \
739  (int64_t)(_b[0] + _b[1]); \
740 })
741 
742 /* uint8_t msa_sum_u8(v16u8 __a)*/
743 #define msa_sum_u8(__a) \
744 ({ \
745  v8u16 _b16; \
746  v4u32 _c32; \
747  _b16 = __builtin_msa_hadd_u_h(__a, __a); \
748  _c32 = __builtin_msa_hadd_u_w(_b16, _b16); \
749  (uint8_t)msa_sum_u32(_c32); \
750 })
751 
752 /* int8_t msa_sum_s8(v16i8 __a) */
753 #define msa_sum_s8(__a) \
754 ({ \
755  v8i16 _b16; \
756  v4i32 _c32; \
757  _b16 = __builtin_msa_hadd_s_h(__a, __a); \
758  _c32 = __builtin_msa_hadd_s_w(_b16, _b16); \
759  (int16_t)msa_sum_s32(_c32); \
760 })
761 
762 /* float msa_sum_f32(v4f32 __a)*/
763 #define msa_sum_f32(__a) ((__a)[0] + (__a)[1] + (__a)[2] + (__a)[3])
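
/* Usage sketch (illustrative addition, not part of the original header):
   horizontally add all sixteen u8 lanes. Note msa_sum_u8 truncates the total
   to uint8_t, so large sums should go through msa_sum_u16/msa_sum_u32 on
   pre-widened data instead. */
static inline uint8_t msa_example_sum_bytes(v16u8 a)
{
    return msa_sum_u8(a);
}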
764 
765 /* v8u16 msa_paddlq_u8(v16u8 __a) */
766 #define msa_paddlq_u8(__a) (__builtin_msa_hadd_u_h(__a, __a))
767 
768 /* v8i16 msa_paddlq_s8(v16i8 __a) */
769 #define msa_paddlq_s8(__a) (__builtin_msa_hadd_s_h(__a, __a))
770 
771 /* v4u32 msa_paddlq_u16 (v8u16 __a)*/
772 #define msa_paddlq_u16(__a) (__builtin_msa_hadd_u_w(__a, __a))
773 
774 /* v4i32 msa_paddlq_s16 (v8i16 __a)*/
775 #define msa_paddlq_s16(__a) (__builtin_msa_hadd_s_w(__a, __a))
776 
777 /* v2u64 msa_paddlq_u32(v4u32 __a) */
778 #define msa_paddlq_u32(__a) (__builtin_msa_hadd_u_d(__a, __a))
779 
780 /* v2i64 msa_paddlq_s32(v4i32 __a) */
781 #define msa_paddlq_s32(__a) (__builtin_msa_hadd_s_d(__a, __a))
782 
783 #define V8U8_2_V8U16(x) {(uint16_t)x[0], (uint16_t)x[1], (uint16_t)x[2], (uint16_t)x[3], \
784  (uint16_t)x[4], (uint16_t)x[5], (uint16_t)x[6], (uint16_t)x[7]}
785 #define V8U8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \
786  (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]}
787 #define V8I8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \
788  (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]}
789 #define V4U16_2_V4U32(x) {(uint32_t)x[0], (uint32_t)x[1], (uint32_t)x[2], (uint32_t)x[3]}
790 #define V4U16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]}
791 #define V4I16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]}
792 #define V2U32_2_V2U64(x) {(uint64_t)x[0], (uint64_t)x[1]}
793 #define V2U32_2_V2I64(x) {(int64_t)x[0], (int64_t)x[1]}
794 
795 /* v8u16 msa_mull_u8(v8u8 __a, v8u8 __b) */
796 #define msa_mull_u8(__a, __b) ((v8u16)__builtin_msa_mulv_h((v8i16)V8U8_2_V8I16(__a), (v8i16)V8U8_2_V8I16(__b)))
797 
798 /* v8i16 msa_mull_s8(v8i8 __a, v8i8 __b)*/
799 #define msa_mull_s8(__a, __b) (__builtin_msa_mulv_h((v8i16)V8I8_2_V8I16(__a), (v8i16)V8I8_2_V8I16(__b)))
800 
801 /* v4u32 msa_mull_u16(v4u16 __a, v4u16 __b) */
802 #define msa_mull_u16(__a, __b) ((v4u32)__builtin_msa_mulv_w((v4i32)V4U16_2_V4I32(__a), (v4i32)V4U16_2_V4I32(__b)))
803 
804 /* v4i32 msa_mull_s16(v4i16 __a, v4i16 __b) */
805 #define msa_mull_s16(__a, __b) (__builtin_msa_mulv_w((v4i32)V4I16_2_V4I32(__a), (v4i32)V4I16_2_V4I32(__b)))
806 
807 /* v2u64 msa_mull_u32(v2u32 __a, v2u32 __b) */
808 #define msa_mull_u32(__a, __b) ((v2u64)__builtin_msa_mulv_d((v2i64)V2U32_2_V2I64(__a), (v2i64)V2U32_2_V2I64(__b)))
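
/* Usage sketch (illustrative addition, not part of the original header):
   multiply two vectors of four u16 lanes while keeping the full 32-bit
   products. */
static inline v4u32 msa_example_widening_mul_u16(v4u16 a, v4u16 b)
{
    return msa_mull_u16(a, b);
}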
809 
810 /* bitwise and: __builtin_msa_and_v */
811 #define msa_andq_u8(__a, __b) ((v16u8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
812 #define msa_andq_s8(__a, __b) ((v16i8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
813 #define msa_andq_u16(__a, __b) ((v8u16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
814 #define msa_andq_s16(__a, __b) ((v8i16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
815 #define msa_andq_u32(__a, __b) ((v4u32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
816 #define msa_andq_s32(__a, __b) ((v4i32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
817 #define msa_andq_u64(__a, __b) ((v2u64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
818 #define msa_andq_s64(__a, __b) ((v2i64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
819 
820 /* bitwise or: __builtin_msa_or_v */
821 #define msa_orrq_u8(__a, __b) ((v16u8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
822 #define msa_orrq_s8(__a, __b) ((v16i8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
823 #define msa_orrq_u16(__a, __b) ((v8u16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
824 #define msa_orrq_s16(__a, __b) ((v8i16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
825 #define msa_orrq_u32(__a, __b) ((v4u32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
826 #define msa_orrq_s32(__a, __b) ((v4i32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
827 #define msa_orrq_u64(__a, __b) ((v2u64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
828 #define msa_orrq_s64(__a, __b) ((v2i64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
829 
830 /* bitwise xor: __builtin_msa_xor_v */
831 #define msa_eorq_u8(__a, __b) ((v16u8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
832 #define msa_eorq_s8(__a, __b) ((v16i8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
833 #define msa_eorq_u16(__a, __b) ((v8u16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
834 #define msa_eorq_s16(__a, __b) ((v8i16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
835 #define msa_eorq_u32(__a, __b) ((v4u32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
836 #define msa_eorq_s32(__a, __b) ((v4i32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
837 #define msa_eorq_u64(__a, __b) ((v2u64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
838 #define msa_eorq_s64(__a, __b) ((v2i64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
839 
840 /* bitwise not: v16u8 __builtin_msa_xori_b (v16u8, 0xff) */
841 #define msa_mvnq_u8(__a) ((v16u8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
842 #define msa_mvnq_s8(__a) ((v16i8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
843 #define msa_mvnq_u16(__a) ((v8u16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
844 #define msa_mvnq_s16(__a) ((v8i16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
845 #define msa_mvnq_u32(__a) ((v4u32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
846 #define msa_mvnq_s32(__a) ((v4i32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
847 #define msa_mvnq_u64(__a) ((v2u64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
848 #define msa_mvnq_s64(__a) ((v2i64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
849 
850 /* compare equal: ceq -> ri = ai == bi ? 1...1:0...0 */
851 #define msa_ceqq_u8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
852 #define msa_ceqq_s8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
853 #define msa_ceqq_u16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
854 #define msa_ceqq_s16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
855 #define msa_ceqq_u32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
856 #define msa_ceqq_s32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
857 #define msa_ceqq_f32(__a, __b) ((v4u32)__builtin_msa_fceq_w((v4f32)(__a), (v4f32)(__b)))
858 #define msa_ceqq_u64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
859 #define msa_ceqq_s64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
860 #define msa_ceqq_f64(__a, __b) ((v2u64)__builtin_msa_fceq_d((v2f64)(__a), (v2f64)(__b)))
861 
862 /* Compare less-than: clt -> ri = ai < bi ? 1...1:0...0 */
863 #define msa_cltq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__a), (v16u8)(__b)))
864 #define msa_cltq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__a), (v16i8)(__b)))
865 #define msa_cltq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__a), (v8u16)(__b)))
866 #define msa_cltq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__a), (v8i16)(__b)))
867 #define msa_cltq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__a), (v4u32)(__b)))
868 #define msa_cltq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__a), (v4i32)(__b)))
869 #define msa_cltq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__a), (v4f32)(__b)))
870 #define msa_cltq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__a), (v2u64)(__b)))
871 #define msa_cltq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__a), (v2i64)(__b)))
872 #define msa_cltq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__a), (v2f64)(__b)))
873 
874 /* compare greater-than: cgt -> ri = ai > bi ? 1...1:0...0 */
875 #define msa_cgtq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__b), (v16u8)(__a)))
876 #define msa_cgtq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__b), (v16i8)(__a)))
877 #define msa_cgtq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__b), (v8u16)(__a)))
878 #define msa_cgtq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__b), (v8i16)(__a)))
879 #define msa_cgtq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__b), (v4u32)(__a)))
880 #define msa_cgtq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__b), (v4i32)(__a)))
881 #define msa_cgtq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__b), (v4f32)(__a)))
882 #define msa_cgtq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__b), (v2u64)(__a)))
883 #define msa_cgtq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__b), (v2i64)(__a)))
884 #define msa_cgtq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__b), (v2f64)(__a)))
885 
886 /* compare less-equal: cle -> ri = ai <= bi ? 1...1:0...0 */
887 #define msa_cleq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__a), (v16u8)(__b)))
888 #define msa_cleq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__a), (v16i8)(__b)))
889 #define msa_cleq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__a), (v8u16)(__b)))
890 #define msa_cleq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__a), (v8i16)(__b)))
891 #define msa_cleq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__a), (v4u32)(__b)))
892 #define msa_cleq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__a), (v4i32)(__b)))
893 #define msa_cleq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__a), (v4f32)(__b)))
894 #define msa_cleq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__a), (v2u64)(__b)))
895 #define msa_cleq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__a), (v2i64)(__b)))
896 #define msa_cleq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__a), (v2f64)(__b)))
897 
898 /* compare greater-equal: cge -> ri = ai >= bi ? 1...1:0...0 */
899 #define msa_cgeq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__b), (v16u8)(__a)))
900 #define msa_cgeq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__b), (v16i8)(__a)))
901 #define msa_cgeq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__b), (v8u16)(__a)))
902 #define msa_cgeq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__b), (v8i16)(__a)))
903 #define msa_cgeq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__b), (v4u32)(__a)))
904 #define msa_cgeq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__b), (v4i32)(__a)))
905 #define msa_cgeq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__b), (v4f32)(__a)))
906 #define msa_cgeq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__b), (v2u64)(__a)))
907 #define msa_cgeq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__b), (v2i64)(__a)))
908 #define msa_cgeq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__b), (v2f64)(__a)))
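
/* Usage sketch (illustrative addition, not part of the original header):
   the comparison macros return all-ones / all-zeros lane masks, so they
   compose with the bitwise macros above for branch-free selection, e.g. keep
   only the lanes where a > b. */
static inline v4i32 msa_example_keep_greater_s32(v4i32 a, v4i32 b)
{
    v4u32 mask = msa_cgtq_s32(a, b);  /* 0xFFFFFFFF where a > b, else 0  */
    return msa_andq_s32(a, mask);     /* zero out the non-matching lanes */
}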
909 
910 /* Shift Left Logical: shl -> ri = ai << bi; */
911 #define msa_shlq_u8(__a, __b) ((v16u8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b)))
912 #define msa_shlq_s8(__a, __b) ((v16i8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b)))
913 #define msa_shlq_u16(__a, __b) ((v8u16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b)))
914 #define msa_shlq_s16(__a, __b) ((v8i16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b)))
915 #define msa_shlq_u32(__a, __b) ((v4u32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b)))
916 #define msa_shlq_s32(__a, __b) ((v4i32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b)))
917 #define msa_shlq_u64(__a, __b) ((v2u64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b)))
918 #define msa_shlq_s64(__a, __b) ((v2i64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b)))
919 
920 /* Immediate Shift Left Logical: shl -> ri = ai << imm; */
921 #define msa_shlq_n_u8(__a, __imm) ((v16u8)__builtin_msa_slli_b((v16i8)(__a), __imm))
922 #define msa_shlq_n_s8(__a, __imm) ((v16i8)__builtin_msa_slli_b((v16i8)(__a), __imm))
923 #define msa_shlq_n_u16(__a, __imm) ((v8u16)__builtin_msa_slli_h((v8i16)(__a), __imm))
924 #define msa_shlq_n_s16(__a, __imm) ((v8i16)__builtin_msa_slli_h((v8i16)(__a), __imm))
925 #define msa_shlq_n_u32(__a, __imm) ((v4u32)__builtin_msa_slli_w((v4i32)(__a), __imm))
926 #define msa_shlq_n_s32(__a, __imm) ((v4i32)__builtin_msa_slli_w((v4i32)(__a), __imm))
927 #define msa_shlq_n_u64(__a, __imm) ((v2u64)__builtin_msa_slli_d((v2i64)(__a), __imm))
928 #define msa_shlq_n_s64(__a, __imm) ((v2i64)__builtin_msa_slli_d((v2i64)(__a), __imm))
929 
930 /* shift right: shrq -> ri = ai >> bi; */
931 #define msa_shrq_u8(__a, __b) ((v16u8)__builtin_msa_srl_b((v16i8)(__a), (v16i8)(__b)))
932 #define msa_shrq_s8(__a, __b) ((v16i8)__builtin_msa_sra_b((v16i8)(__a), (v16i8)(__b)))
933 #define msa_shrq_u16(__a, __b) ((v8u16)__builtin_msa_srl_h((v8i16)(__a), (v8i16)(__b)))
934 #define msa_shrq_s16(__a, __b) ((v8i16)__builtin_msa_sra_h((v8i16)(__a), (v8i16)(__b)))
935 #define msa_shrq_u32(__a, __b) ((v4u32)__builtin_msa_srl_w((v4i32)(__a), (v4i32)(__b)))
936 #define msa_shrq_s32(__a, __b) ((v4i32)__builtin_msa_sra_w((v4i32)(__a), (v4i32)(__b)))
937 #define msa_shrq_u64(__a, __b) ((v2u64)__builtin_msa_srl_d((v2i64)(__a), (v2i64)(__b)))
938 #define msa_shrq_s64(__a, __b) ((v2i64)__builtin_msa_sra_d((v2i64)(__a), (v2i64)(__b)))
939 
940 /* Immediate Shift Right: shr -> ri = ai >> imm; */
941 #define msa_shrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srli_b((v16i8)(__a), __imm))
942 #define msa_shrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srai_b((v16i8)(__a), __imm))
943 #define msa_shrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srli_h((v8i16)(__a), __imm))
944 #define msa_shrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srai_h((v8i16)(__a), __imm))
945 #define msa_shrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srli_w((v4i32)(__a), __imm))
946 #define msa_shrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srai_w((v4i32)(__a), __imm))
947 #define msa_shrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srli_d((v2i64)(__a), __imm))
948 #define msa_shrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srai_d((v2i64)(__a), __imm))
949 
950 /* Immediate Shift Right Rounded: shr -> ri = ai >> (rounded)imm; */
951 #define msa_rshrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srlri_b((v16i8)(__a), __imm))
952 #define msa_rshrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srari_b((v16i8)(__a), __imm))
953 #define msa_rshrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srlri_h((v8i16)(__a), __imm))
954 #define msa_rshrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srari_h((v8i16)(__a), __imm))
955 #define msa_rshrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srlri_w((v4i32)(__a), __imm))
956 #define msa_rshrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srari_w((v4i32)(__a), __imm))
957 #define msa_rshrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srlri_d((v2i64)(__a), __imm))
958 #define msa_rshrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srari_d((v2i64)(__a), __imm))
959 
960 /* Vector rounding shift right: qrshrq -> ri = ai >> bi (rounded); */
961 #define msa_qrshrq_s32(a, b) ((v4i32)__msa_srar_w((v4i32)(a), (v4i32)(b)))
962 
963 /* Rename the MSA builtin functions to unify the naming style for intrin_msa.hpp */
964 #define msa_qaddq_u8 __builtin_msa_adds_u_b
965 #define msa_qaddq_s8 __builtin_msa_adds_s_b
966 #define msa_qaddq_u16 __builtin_msa_adds_u_h
967 #define msa_qaddq_s16 __builtin_msa_adds_s_h
968 #define msa_qaddq_u32 __builtin_msa_adds_u_w
969 #define msa_qaddq_s32 __builtin_msa_adds_s_w
970 #define msa_qaddq_u64 __builtin_msa_adds_u_d
971 #define msa_qaddq_s64 __builtin_msa_adds_s_d
972 #define msa_addq_u8(a, b) ((v16u8)__builtin_msa_addv_b((v16i8)(a), (v16i8)(b)))
973 #define msa_addq_s8 __builtin_msa_addv_b
974 #define msa_addq_u16(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)(a), (v8i16)(b)))
975 #define msa_addq_s16 __builtin_msa_addv_h
976 #define msa_addq_u32(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)(a), (v4i32)(b)))
977 #define msa_addq_s32 __builtin_msa_addv_w
978 #define msa_addq_f32 __builtin_msa_fadd_w
979 #define msa_addq_u64(a, b) ((v2u64)__builtin_msa_addv_d((v2i64)(a), (v2i64)(b)))
980 #define msa_addq_s64 __builtin_msa_addv_d
981 #define msa_addq_f64 __builtin_msa_fadd_d
982 #define msa_qsubq_u8 __builtin_msa_subs_u_b
983 #define msa_qsubq_s8 __builtin_msa_subs_s_b
984 #define msa_qsubq_u16 __builtin_msa_subs_u_h
985 #define msa_qsubq_s16 __builtin_msa_subs_s_h
986 #define msa_subq_u8(a, b) ((v16u8)__builtin_msa_subv_b((v16i8)(a), (v16i8)(b)))
987 #define msa_subq_s8 __builtin_msa_subv_b
988 #define msa_subq_u16(a, b) ((v8u16)__builtin_msa_subv_h((v8i16)(a), (v8i16)(b)))
989 #define msa_subq_s16 __builtin_msa_subv_h
990 #define msa_subq_u32(a, b) ((v4u32)__builtin_msa_subv_w((v4i32)(a), (v4i32)(b)))
991 #define msa_subq_s32 __builtin_msa_subv_w
992 #define msa_subq_f32 __builtin_msa_fsub_w
993 #define msa_subq_u64(a, b) ((v2u64)__builtin_msa_subv_d((v2i64)(a), (v2i64)(b)))
994 #define msa_subq_s64 __builtin_msa_subv_d
995 #define msa_subq_f64 __builtin_msa_fsub_d
996 #define msa_mulq_u8(a, b) ((v16u8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))
997 #define msa_mulq_s8(a, b) ((v16i8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))
998 #define msa_mulq_u16(a, b) ((v8u16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
999 #define msa_mulq_s16(a, b) ((v8i16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
1000 #define msa_mulq_u32(a, b) ((v4u32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
1001 #define msa_mulq_s32(a, b) ((v4i32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
1002 #define msa_mulq_u64(a, b) ((v2u64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
1003 #define msa_mulq_s64(a, b) ((v2i64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
1004 #define msa_mulq_f32 __builtin_msa_fmul_w
1005 #define msa_mulq_f64 __builtin_msa_fmul_d
1006 #define msa_divq_f32 __builtin_msa_fdiv_w
1007 #define msa_divq_f64 __builtin_msa_fdiv_d
1008 #define msa_dotp_s_h __builtin_msa_dotp_s_h
1009 #define msa_dotp_s_w __builtin_msa_dotp_s_w
1010 #define msa_dotp_s_d __builtin_msa_dotp_s_d
1011 #define msa_dotp_u_h __builtin_msa_dotp_u_h
1012 #define msa_dotp_u_w __builtin_msa_dotp_u_w
1013 #define msa_dotp_u_d __builtin_msa_dotp_u_d
1014 #define msa_dpadd_s_h __builtin_msa_dpadd_s_h
1015 #define msa_dpadd_s_w __builtin_msa_dpadd_s_w
1016 #define msa_dpadd_s_d __builtin_msa_dpadd_s_d
1017 #define msa_dpadd_u_h __builtin_msa_dpadd_u_h
1018 #define msa_dpadd_u_w __builtin_msa_dpadd_u_w
1019 #define msa_dpadd_u_d __builtin_msa_dpadd_u_d
1020 
1021 #define ILVRL_B2(RTYPE, in0, in1, low, hi) do { \
1022  low = (RTYPE)__builtin_msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
1023  hi = (RTYPE)__builtin_msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
1024  } while (0)
1025 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
1026 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
1027 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
1028 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1029 #define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1030 
1031 #define ILVRL_H2(RTYPE, in0, in1, low, hi) do { \
1032  low = (RTYPE)__builtin_msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
1033  hi = (RTYPE)__builtin_msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
1034  } while (0)
1035 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
1036 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
1037 #define ILVRL_H2_UH(...) ILVRL_H2(v8u16, __VA_ARGS__)
1038 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
1039 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1040 #define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)
1041 
1042 #define ILVRL_W2(RTYPE, in0, in1, low, hi) do { \
1043  low = (RTYPE)__builtin_msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
1044  hi = (RTYPE)__builtin_msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
1045  } while (0)
1046 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
1047 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1048 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1049 #define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
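
/* Illustrative usage sketch (not part of the original header): each ILVRL_*2
 * helper yields both interleaves of its two inputs in one statement, exactly
 * as if ilvr and ilvl had been called separately:
 *
 *   v8i16 v    = __builtin_msa_fill_h(7);
 *   v8i16 zero = __builtin_msa_fill_h(0);
 *   v8i16 lo, hi;
 *   ILVRL_H2_SH(v, zero, lo, hi);
 *   // lo == __builtin_msa_ilvr_h(v, zero), hi == __builtin_msa_ilvl_h(v, zero)
 */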
1050 
1051 /* absq, qabsq (r = |a|;) */
1052 #define msa_absq_s8(a) __builtin_msa_add_a_b(a, __builtin_msa_fill_b(0))
1053 #define msa_absq_s16(a) __builtin_msa_add_a_h(a, __builtin_msa_fill_h(0))
1054 #define msa_absq_s32(a) __builtin_msa_add_a_w(a, __builtin_msa_fill_w(0))
1055 #define msa_absq_s64(a) __builtin_msa_add_a_d(a, __builtin_msa_fill_d(0))
1056 #define msa_absq_f32(a) ((v4f32)__builtin_msa_bclri_w((v4u32)(a), 31))
1057 #define msa_absq_f64(a) ((v2f64)__builtin_msa_bclri_d((v2u64)(a), 63))
1058 #define msa_qabsq_s8(a) __builtin_msa_adds_a_b(a, __builtin_msa_fill_b(0))
1059 #define msa_qabsq_s16(a) __builtin_msa_adds_a_h(a, __builtin_msa_fill_h(0))
1060 #define msa_qabsq_s32(a) __builtin_msa_adds_a_w(a, __builtin_msa_fill_w(0))
1061 #define msa_qabsq_s64(a) __builtin_msa_adds_a_d(a, __builtin_msa_fill_d(0))
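
/* Illustrative sketch (not part of the original header): absq_* wraps on the
 * most negative value, while qabsq_* saturates it to the type maximum:
 *
 *   v4i32 x    = __builtin_msa_fill_w(INT32_MIN);
 *   v4i32 wrap = msa_absq_s32(x);   // each lane stays INT32_MIN (wrap-around)
 *   v4i32 sat  = msa_qabsq_s32(x);  // each lane saturates to INT32_MAX
 */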
1062 
1063 /* abdq, qabdq (r = |a - b|;) */
1064 #define msa_abdq_u8 __builtin_msa_asub_u_b
1065 #define msa_abdq_s8 __builtin_msa_asub_s_b
1066 #define msa_abdq_u16 __builtin_msa_asub_u_h
1067 #define msa_abdq_s16 __builtin_msa_asub_s_h
1068 #define msa_abdq_u32 __builtin_msa_asub_u_w
1069 #define msa_abdq_s32 __builtin_msa_asub_s_w
1070 #define msa_abdq_u64 __builtin_msa_asub_u_d
1071 #define msa_abdq_s64 __builtin_msa_asub_s_d
1072 #define msa_abdq_f32(a, b) msa_absq_f32(__builtin_msa_fsub_w(a, b))
1073 #define msa_abdq_f64(a, b) msa_absq_f64(__builtin_msa_fsub_d(a, b))
1074 #define msa_qabdq_s8(a, b) msa_qabsq_s8(__builtin_msa_subs_s_b(a, b))
1075 #define msa_qabdq_s16(a, b) msa_qabsq_s16(__builtin_msa_subs_s_h(a, b))
1076 #define msa_qabdq_s32(a, b) msa_qabsq_s32(__builtin_msa_subs_s_w(a, b))
1077 #define msa_qabdq_s64(a, b) msa_qabsq_s64(__builtin_msa_subs_s_d(a, b))
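
/* Illustrative sketch (not part of the original header): abdq_* computes the
 * element-wise absolute difference |a - b| directly, so unsigned operands do
 * not wrap the way a plain subtraction would:
 *
 *   v16u8 a = (v16u8)__builtin_msa_fill_b(10);
 *   v16u8 b = (v16u8)__builtin_msa_fill_b(250);
 *   v16u8 d = msa_abdq_u8(a, b);    // every lane: |10 - 250| = 240
 */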
1078 
1079 /* sqrtq, rsqrtq (r = sqrt(a); r ~= 1/sqrt(a), approximate;) */
1080 #define msa_sqrtq_f32 __builtin_msa_fsqrt_w
1081 #define msa_sqrtq_f64 __builtin_msa_fsqrt_d
1082 #define msa_rsqrtq_f32 __builtin_msa_frsqrt_w
1083 #define msa_rsqrtq_f64 __builtin_msa_frsqrt_d
1084 
1085 
1086 /* mlaq: r = a + b * c; */
1087 __extension__ extern __inline v4i32
1088 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
1089 msa_mlaq_s32(v4i32 __a, v4i32 __b, v4i32 __c)
1090 {
1091  __asm__ volatile("maddv.w %w[__a], %w[__b], %w[__c]\n"
1092  // Outputs
1093  : [__a] "+f"(__a)
1094  // Inputs
1095  : [__b] "f"(__b), [__c] "f"(__c));
1096  return __a;
1097 }
1098 
1099 __extension__ extern __inline v2i64
1100 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
1101 msa_mlaq_s64(v2i64 __a, v2i64 __b, v2i64 __c)
1102 {
1103  __asm__ volatile("maddv.d %w[__a], %w[__b], %w[__c]\n"
1104  // Outputs
1105  : [__a] "+f"(__a)
1106  // Inputs
1107  : [__b] "f"(__b), [__c] "f"(__c));
1108  return __a;
1109 }
1110 
1111 __extension__ extern __inline v4f32
1112 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
1113 msa_mlaq_f32(v4f32 __a, v4f32 __b, v4f32 __c)
1114 {
1115  __asm__ volatile("fmadd.w %w[__a], %w[__b], %w[__c]\n"
1116  // Outputs
1117  : [__a] "+f"(__a)
1118  // Inputs
1119  : [__b] "f"(__b), [__c] "f"(__c));
1120  return __a;
1121 }
1122 
1123 __extension__ extern __inline v2f64
1124 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
1125 msa_mlaq_f64(v2f64 __a, v2f64 __b, v2f64 __c)
1126 {
1127  __asm__ volatile("fmadd.d %w[__a], %w[__b], %w[__c]\n"
1128  // Outputs
1129  : [__a] "+f"(__a)
1130  // Inputs
1131  : [__b] "f"(__b), [__c] "f"(__c));
1132  return __a;
1133 }
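
/* Illustrative sketch (not part of the original header): mlaq fuses a
 * multiply and an accumulate, r = a + b * c, per lane:
 *
 *   v4i32 acc = __builtin_msa_fill_w(1);
 *   v4i32 b   = __builtin_msa_fill_w(2);
 *   v4i32 c   = __builtin_msa_fill_w(3);
 *   acc = msa_mlaq_s32(acc, b, c);  // every lane: 1 + 2 * 3 = 7
 */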
1134 
1135 /* cntq (r = number of set bits in each element of a;) */
1136 #define msa_cntq_s8 __builtin_msa_pcnt_b
1137 #define msa_cntq_s16 __builtin_msa_pcnt_h
1138 #define msa_cntq_s32 __builtin_msa_pcnt_w
1139 #define msa_cntq_s64 __builtin_msa_pcnt_d
1140 
1141 /* bslq (a: bit mask; for each bit, r = b where a == 0 and r = c where a == 1;) */
1142 #define msa_bslq_u8 __builtin_msa_bsel_v
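
/* Illustrative sketch (not part of the original header): bslq selects per bit,
 * taking bits from its second operand where the mask bit is 0 and from the
 * third where the mask bit is 1:
 *
 *   v16u8 mask = (v16u8)__builtin_msa_fill_b(0x0F);
 *   v16u8 b    = (v16u8)__builtin_msa_fill_b(0x00);
 *   v16u8 c    = (v16u8)__builtin_msa_fill_b(0xFF);
 *   v16u8 r    = msa_bslq_u8(mask, b, c);  // every lane: 0x0F
 */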
1143 
1144 /* ilvrq, ilvlq (little-endian (EL) layout only; ilvrq: b0, a0, b1, a1; ilvlq: b2, a2, b3, a3;) */
1145 #define msa_ilvrq_s8 __builtin_msa_ilvr_b
1146 #define msa_ilvrq_s16 __builtin_msa_ilvr_h
1147 #define msa_ilvrq_s32 __builtin_msa_ilvr_w
1148 #define msa_ilvrq_s64 __builtin_msa_ilvr_d
1149 #define msa_ilvlq_s8 __builtin_msa_ilvl_b
1150 #define msa_ilvlq_s16 __builtin_msa_ilvl_h
1151 #define msa_ilvlq_s32 __builtin_msa_ilvl_w
1152 #define msa_ilvlq_s64 __builtin_msa_ilvl_d
1153 
1154 /* ilvevq, ilvodq (ilvevq: b0, a0, b2, a2; ilvodq: b1, a1, b3, a3;) */
1155 #define msa_ilvevq_s8 __builtin_msa_ilvev_b
1156 #define msa_ilvevq_s16 __builtin_msa_ilvev_h
1157 #define msa_ilvevq_s32 __builtin_msa_ilvev_w
1158 #define msa_ilvevq_s64 __builtin_msa_ilvev_d
1159 #define msa_ilvodq_s8 __builtin_msa_ilvod_b
1160 #define msa_ilvodq_s16 __builtin_msa_ilvod_h
1161 #define msa_ilvodq_s32 __builtin_msa_ilvod_w
1162 #define msa_ilvodq_s64 __builtin_msa_ilvod_d
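
/* Illustrative sketch (not part of the original header), restating the lane
 * layouts above for 32-bit elements a = {a0,a1,a2,a3}, b = {b0,b1,b2,b3}:
 *
 *   msa_ilvrq_s32(a, b)  -> {b0, a0, b1, a1}
 *   msa_ilvlq_s32(a, b)  -> {b2, a2, b3, a3}
 *   msa_ilvevq_s32(a, b) -> {b0, a0, b2, a2}
 *   msa_ilvodq_s32(a, b) -> {b1, a1, b3, a3}
 */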
1163 
1164 /* extq (concatenate a and b, then extract elements starting at index c; r[i] = (a || b)[c + i];) */
1165 #ifdef _MIPSEB
1166 #define msa_extq_s8(a, b, c) \
1167 (__builtin_msa_vshf_b(__builtin_msa_subv_b((v16i8)((v2i64){0x1716151413121110, 0x1F1E1D1C1B1A1918}), __builtin_msa_fill_b(c)), a, b))
1168 #define msa_extq_s16(a, b, c) \
1169 (__builtin_msa_vshf_h(__builtin_msa_subv_h((v8i16)((v2i64){0x000B000A00090008, 0x000F000E000D000C}), __builtin_msa_fill_h(c)), a, b))
1170 #define msa_extq_s32(a, b, c) \
1171 (__builtin_msa_vshf_w(__builtin_msa_subv_w((v4i32)((v2i64){0x0000000500000004, 0x0000000700000006}), __builtin_msa_fill_w(c)), a, b))
1172 #define msa_extq_s64(a, b, c) \
1173 (__builtin_msa_vshf_d(__builtin_msa_subv_d((v2i64){0x0000000000000002, 0x0000000000000003}, __builtin_msa_fill_d(c)), a, b))
1174 #else
1175 #define msa_extq_s8(a, b, c) \
1176 (__builtin_msa_vshf_b(__builtin_msa_addv_b((v16i8)((v2i64){0x0706050403020100, 0x0F0E0D0C0B0A0908}), __builtin_msa_fill_b(c)), b, a))
1177 #define msa_extq_s16(a, b, c) \
1178 (__builtin_msa_vshf_h(__builtin_msa_addv_h((v8i16)((v2i64){0x0003000200010000, 0x0007000600050004}), __builtin_msa_fill_h(c)), b, a))
1179 #define msa_extq_s32(a, b, c) \
1180 (__builtin_msa_vshf_w(__builtin_msa_addv_w((v4i32)((v2i64){0x0000000100000000, 0x0000000300000002}), __builtin_msa_fill_w(c)), b, a))
1181 #define msa_extq_s64(a, b, c) \
1182 (__builtin_msa_vshf_d(__builtin_msa_addv_d((v2i64){0x0000000000000000, 0x0000000000000001}, __builtin_msa_fill_d(c)), b, a))
1183 #endif /* _MIPSEB */
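
/* Illustrative sketch (not part of the original header): extq behaves like a
 * NEON-style extract, returning elements c .. c+n-1 of the concatenation of a
 * (lower indices) and b (upper indices):
 *
 *   v4i32 a = (v4i32){0, 1, 2, 3};
 *   v4i32 b = (v4i32){4, 5, 6, 7};
 *   v4i32 r = msa_extq_s32(a, b, 1);   // r = {1, 2, 3, 4}
 */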
1184 
1185 /* cvttruncq (float -> int, rounding toward zero), cvttintq (float -> int, current rounding mode), cvtrintq (round float to an integral value) */
1186 #define msa_cvttruncq_u32_f32 __builtin_msa_ftrunc_u_w
1187 #define msa_cvttruncq_s32_f32 __builtin_msa_ftrunc_s_w
1188 #define msa_cvttruncq_u64_f64 __builtin_msa_ftrunc_u_d
1189 #define msa_cvttruncq_s64_f64 __builtin_msa_ftrunc_s_d
1190 #define msa_cvttintq_u32_f32 __builtin_msa_ftint_u_w
1191 #define msa_cvttintq_s32_f32 __builtin_msa_ftint_s_w
1192 #define msa_cvttintq_u64_f64 __builtin_msa_ftint_u_d
1193 #define msa_cvttintq_s64_f64 __builtin_msa_ftint_s_d
1194 #define msa_cvtrintq_f32 __builtin_msa_frint_w
1195 #define msa_cvtrintq_f64 __builtin_msa_frint_d
1196 
1197 /* cvtfintq (int -> float), cvtfq (precision conversion between f32 and f64) */
1198 #define msa_cvtfintq_f32_u32 __builtin_msa_ffint_u_w
1199 #define msa_cvtfintq_f32_s32 __builtin_msa_ffint_s_w
1200 #define msa_cvtfintq_f64_u64 __builtin_msa_ffint_u_d
1201 #define msa_cvtfintq_f64_s64 __builtin_msa_ffint_s_d
1202 #define msa_cvtfq_f32_f64 __builtin_msa_fexdo_w
1203 #define msa_cvtflq_f64_f32 __builtin_msa_fexupr_d
1204 #define msa_cvtfhq_f64_f32 __builtin_msa_fexupl_d
1205 
1206 #define msa_addl_u8(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)V8U8_2_V8I16(a), (v8i16)V8U8_2_V8I16(b)))
1207 #define msa_addl_s8(a, b) (__builtin_msa_addv_h((v8i16)V8I8_2_V8I16(a), (v8i16)V8I8_2_V8I16(b)))
1208 #define msa_addl_u16(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)V4U16_2_V4I32(a), (v4i32)V4U16_2_V4I32(b)))
1209 #define msa_addl_s16(a, b) (__builtin_msa_addv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b)))
1210 #define msa_subl_s16(a, b) (__builtin_msa_subv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b)))
1211 #define msa_recpeq_f32 __builtin_msa_frcp_w
1212 #define msa_recpsq_f32(a, b) (__builtin_msa_fsub_w(msa_dupq_n_f32(2.0f), __builtin_msa_fmul_w(a, b)))
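
/* Illustrative sketch (not part of the original header): frcp_w only
 * approximates 1/a; one Newton-Raphson step with recpsq sharpens it,
 * x1 = x0 * (2 - a * x0):
 *
 *   v4f32 a  = msa_dupq_n_f32(3.0f);
 *   v4f32 x0 = msa_recpeq_f32(a);                         // rough 1/3
 *   v4f32 x1 = msa_mulq_f32(x0, msa_recpsq_f32(a, x0));   // refined 1/3
 */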
1213 
1214 #define MSA_INTERLEAVED_IMPL_LOAD2_STORE2(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \
1215 __extension__ extern __inline void \
1216 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1217 msa_ld2q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b) \
1218 { \
1219  _Tpv v0 = msa_ld1q_##suffix(ptr); \
1220  _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \
1221  *a = (_Tpv)__builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \
1222  *b = (_Tpv)__builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \
1223 } \
1224 __extension__ extern __inline void \
1225 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1226 msa_st2q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b) \
1227 { \
1228  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df((_Tpvs)b, (_Tpvs)a)); \
1229  msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df((_Tpvs)b, (_Tpvs)a)); \
1230 }
1231 
1232 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint8_t, v16u8, v16i8, u8, b, 16)
1233 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int8_t, v16i8, v16i8, s8, b, 16)
1234 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint16_t, v8u16, v8i16, u16, h, 8)
1235 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int16_t, v8i16, v8i16, s16, h, 8)
1236 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint32_t, v4u32, v4i32, u32, w, 4)
1237 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int32_t, v4i32, v4i32, s32, w, 4)
1238 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(float, v4f32, v4i32, f32, w, 4)
1239 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint64_t, v2u64, v2i64, u64, d, 2)
1240 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int64_t, v2i64, v2i64, s64, d, 2)
1241 MSA_INTERLEAVED_IMPL_LOAD2_STORE2(double, v2f64, v2i64, f64, d, 2)
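
/* Illustrative usage sketch (not part of the original header): ld2q
 * de-interleaves two channels and st2q re-interleaves them; "buf" below is a
 * hypothetical buffer of 16 interleaved {c0, c1} byte pairs:
 *
 *   uint8_t buf[32];             // filled elsewhere
 *   v16u8 c0, c1;
 *   msa_ld2q_u8(buf, &c0, &c1);  // c0 = even bytes, c1 = odd bytes
 *   msa_st2q_u8(buf, c0, c1);    // writes the pairs back interleaved
 */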
1242 
1243 #ifdef _MIPSEB
1244 #define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
1245 __extension__ extern __inline void \
1246 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1247 msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
1248 { \
1249  _Tpv v0 = msa_ld1q_##suffix(ptr); \
1250  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
1251  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
1252  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0704011F1F1F1F1F, 0x1F1C191613100D0A}), (_Tpvs)v0, (_Tpvs)v1); \
1253  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150E0B080502, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
1254  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0603001F1F1F1F1F, 0x1E1B1815120F0C09}), (_Tpvs)v0, (_Tpvs)v1); \
1255  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150D0A070401, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
1256  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x05021F1F1F1F1F1F, 0x1D1A1714110E0B08}), (_Tpvs)v0, (_Tpvs)v1); \
1257  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x17160F0C09060300, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
1258 }
1259 #else
1260 #define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
1261 __extension__ extern __inline void \
1262 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1263 msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
1264 { \
1265  _Tpv v0 = msa_ld1q_##suffix(ptr); \
1266  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
1267  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
1268  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15120F0C09060300, 0x00000000001E1B18}), (_Tpvs)v1, (_Tpvs)v0); \
1269  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1D1A1714110A0908}), (_Tpvs)v2, v3); \
1270  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1613100D0A070401, 0x00000000001F1C19}), (_Tpvs)v1, (_Tpvs)v0); \
1271  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1E1B1815120A0908}), (_Tpvs)v2, v3); \
1272  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1714110E0B080502, 0x0000000000001D1A}), (_Tpvs)v1, (_Tpvs)v0); \
1273  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1F1C191613100908}), (_Tpvs)v2, v3); \
1274 }
1275 #endif
1276 
1277 MSA_INTERLEAVED_IMPL_LOAD3_8(uint8_t, v16u8, v16i8, u8)
1278 MSA_INTERLEAVED_IMPL_LOAD3_8(int8_t, v16i8, v16i8, s8)
1279 
1280 #ifdef _MIPSEB
1281 #define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
1282 __extension__ extern __inline void \
1283 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1284 msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
1285 { \
1286  _Tpv v0 = msa_ld1q_##suffix(ptr); \
1287  _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
1288  _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
1289  _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00030000000F000F, 0x000F000C00090006}), (_Tpvs)v1, (_Tpvs)v0); \
1290  *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000A00050002, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
1291  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0002000F000F000F, 0x000E000B00080005}), (_Tpvs)v1, (_Tpvs)v0); \
1292  *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000700040001, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
1293  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000F000F000F, 0x000D000A00070004}), (_Tpvs)v1, (_Tpvs)v0); \
1294  *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000600030000, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
1295 }
1296 #else
1297 #define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
1298 __extension__ extern __inline void \
1299 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1300 msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
1301 { \
1302  _Tpv v0 = msa_ld1q_##suffix(ptr); \
1303  _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
1304  _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
1305  _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0009000600030000, 0x00000000000F000C}), (_Tpvs)v1, (_Tpvs)v0); \
1306  *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000D000A00050004}), (_Tpvs)v2, v3); \
1307  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A000700040001, 0x000000000000000D}), (_Tpvs)v1, (_Tpvs)v0); \
1308  *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000E000B00080004}), (_Tpvs)v2, v3); \
1309  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000800050002, 0x000000000000000E}), (_Tpvs)v1, (_Tpvs)v0); \
1310  *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000F000C00090004}), (_Tpvs)v2, v3); \
1311 }
1312 #endif
1313 
1314 MSA_INTERLEAVED_IMPL_LOAD3_16(uint16_t, v8u16, v8i16, u16)
1315 MSA_INTERLEAVED_IMPL_LOAD3_16(int16_t, v8i16, v8i16, s16)
1316 
1317 #define MSA_INTERLEAVED_IMPL_LOAD3_32(_Tp, _Tpv, _Tpvs, suffix) \
1318 __extension__ extern __inline void \
1319 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1320 msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
1321 { \
1322  _Tpv v00 = msa_ld1q_##suffix(ptr); \
1323  _Tpv v01 = msa_ld1q_##suffix(ptr + 4); \
1324  _Tpv v02 = msa_ld1q_##suffix(ptr + 8); \
1325  _Tpvs v10 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v01, (v2i64)v01), (_Tpvs)v00); \
1326  _Tpvs v11 = __builtin_msa_ilvr_w((_Tpvs)v02, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v00, (v2i64)v00)); \
1327  _Tpvs v12 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v02, (v2i64)v02), (_Tpvs)v01); \
1328  *a = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v11, (v2i64)v11), v10); \
1329  *b = (_Tpv)__builtin_msa_ilvr_w(v12, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v10, (v2i64)v10)); \
1330  *c = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v12, (v2i64)v12), v11); \
1331 }
1332 
1333 MSA_INTERLEAVED_IMPL_LOAD3_32(uint32_t, v4u32, v4i32, u32)
1334 MSA_INTERLEAVED_IMPL_LOAD3_32(int32_t, v4i32, v4i32, s32)
1335 MSA_INTERLEAVED_IMPL_LOAD3_32(float, v4f32, v4i32, f32)
1336 
1337 #define MSA_INTERLEAVED_IMPL_LOAD3_64(_Tp, _Tpv, suffix) \
1338 __extension__ extern __inline void \
1339 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1340 msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
1341 { \
1342  *((_Tp*)a) = *ptr; *((_Tp*)b) = *(ptr + 1); *((_Tp*)c) = *(ptr + 2); \
1343  *((_Tp*)a + 1) = *(ptr + 3); *((_Tp*)b + 1) = *(ptr + 4); *((_Tp*)c + 1) = *(ptr + 5); \
1344 }
1345 
1346 MSA_INTERLEAVED_IMPL_LOAD3_64(uint64_t, v2u64, u64)
1347 MSA_INTERLEAVED_IMPL_LOAD3_64(int64_t, v2i64, s64)
1348 MSA_INTERLEAVED_IMPL_LOAD3_64(double, v2f64, f64)
1349 
1350 #ifdef _MIPSEB
1351 #define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
1352 __extension__ extern __inline void \
1353 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1354 msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
1355 { \
1356  _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0F0E0D0C0B1F1F1F, 0x1F1E1D1C1B1A1F1F}), (_Tpvs)b, (_Tpvs)a); \
1357  _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0D1C140C1B130B1A, 0x1F170F1E160E1D15}), (_Tpvs)c, (_Tpvs)v0); \
1358  msa_st1q_##suffix(ptr, (_Tpv)v1); \
1359  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A09080706051F1F, 0x19181716151F1F1F}), (_Tpvs)b, (_Tpvs)a); \
1360  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1D14071C13061B12, 0x170A1F16091E1508}), (_Tpvs)c, (_Tpvs)v0); \
1361  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
1362  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x04030201001F1F1F, 0x14131211101F1F1F}), (_Tpvs)b, (_Tpvs)a); \
1363  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15021C14011B1300, 0x051F17041E16031D}), (_Tpvs)c, (_Tpvs)v0); \
1364  msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
1365 }
1366 #else
1367 #define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
1368 __extension__ extern __inline void \
1369 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1370 msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
1371 { \
1372  _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000050403020100, 0x0000001413121110}), (_Tpvs)b, (_Tpvs)a); \
1373  _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A02110901100800, 0x05140C04130B0312}), (_Tpvs)c, (_Tpvs)v0); \
1374  msa_st1q_##suffix(ptr, (_Tpv)v1); \
1375  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000A09080706, 0x00001A1918171615}), (_Tpvs)b, (_Tpvs)a); \
1376  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x170A011609001508, 0x0D04190C03180B02}), (_Tpvs)c, (_Tpvs)v0); \
1377  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
1378  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000F0E0D0C0B, 0x0000001F1E1D1C1B}), (_Tpvs)b, (_Tpvs)a); \
1379  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x021C09011B08001A, 0x1F0C041E0B031D0A}), (_Tpvs)c, (_Tpvs)v0); \
1380  msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
1381 }
1382 #endif
1383 
1384 MSA_INTERLEAVED_IMPL_STORE3_8(uint8_t, v16u8, v16i8, u8)
1385 MSA_INTERLEAVED_IMPL_STORE3_8(int8_t, v16i8, v16i8, s8)
1386 
1387 #ifdef _MIPSEB
1388 #define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
1389 __extension__ extern __inline void \
1390 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1391 msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
1392 { \
1393  _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000700060005000F, 0x000F000E000D000F}), (_Tpvs)b, (_Tpvs)a); \
1394  _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A0006000D0009, 0x000F000B0007000E}), (_Tpvs)c, (_Tpvs)v0); \
1395  msa_st1q_##suffix(ptr, (_Tpv)v1); \
1396  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00040003000F000F, 0x000C000B000A000F}), (_Tpvs)b, (_Tpvs)a); \
1397  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000E000A0003000D, 0x0005000F000B0004}), (_Tpvs)c, (_Tpvs)v0); \
1398  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
1399  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000200010000000F, 0x00090008000F000F}), (_Tpvs)b, (_Tpvs)a); \
1400  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000E00090000, 0x000B0002000F000A}), (_Tpvs)c, (_Tpvs)v0); \
1401  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
1402 }
1403 #else
1404 #define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
1405 __extension__ extern __inline void \
1406 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1407 msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
1408 { \
1409  _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000200010000, 0x0000000A00090008}), (_Tpvs)b, (_Tpvs)a); \
1410  _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000800040000, 0x0006000200090005}), (_Tpvs)c, (_Tpvs)v0); \
1411  msa_st1q_##suffix(ptr, (_Tpv)v1); \
1412  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000500040003, 0x00000000000C000B}), (_Tpvs)b, (_Tpvs)a); \
1413  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B00040000000A, 0x0002000C00050001}), (_Tpvs)c, (_Tpvs)v0); \
1414  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
1415  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000000070006, 0x0000000F000E000D}), (_Tpvs)b, (_Tpvs)a); \
1416  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00050000000D0004, 0x000F00060001000E}), (_Tpvs)c, (_Tpvs)v0); \
1417  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
1418 }
1419 #endif
1420 
1421 MSA_INTERLEAVED_IMPL_STORE3_16(uint16_t, v8u16, v8i16, u16)
1422 MSA_INTERLEAVED_IMPL_STORE3_16(int16_t, v8i16, v8i16, s16)
1423 
1424 #ifdef _MIPSEB
1425 #define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \
1426 __extension__ extern __inline void \
1427 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1428 msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
1429 { \
1430  _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000007, 0x0000000700000006}), (_Tpvs)b, (_Tpvs)a); \
1431  _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000006, 0x0000000700000005}), (_Tpvs)c, (_Tpvs)v0); \
1432  msa_st1q_##suffix(ptr, (_Tpv)v1); \
1433  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000001, 0x0000000500000007}), (_Tpvs)b, (_Tpvs)a); \
1434  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000700000004, 0x0000000500000002}), (_Tpvs)c, (_Tpvs)v0); \
1435  msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \
1436  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000007, 0x0000000400000007}), (_Tpvs)b, (_Tpvs)a); \
1437  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000000, 0x0000000100000007}), (_Tpvs)c, (_Tpvs)v0); \
1438  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
1439 }
1440 #else
1441 #define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \
1442 __extension__ extern __inline void \
1443 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1444 msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
1445 { \
1446  _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000100000000, 0x0000000000000004}), (_Tpvs)b, (_Tpvs)a); \
1447  _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000000, 0x0000000100000004}), (_Tpvs)c, (_Tpvs)v0); \
1448  msa_st1q_##suffix(ptr, (_Tpv)v1); \
1449  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000002, 0x0000000600000005}), (_Tpvs)b, (_Tpvs)a); \
1450  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000002, 0x0000000300000000}), (_Tpvs)c, (_Tpvs)v0); \
1451  msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \
1452  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000003, 0x0000000000000007}), (_Tpvs)b, (_Tpvs)a); \
1453  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000006, 0x0000000700000002}), (_Tpvs)c, (_Tpvs)v0); \
1454  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
1455 }
1456 #endif
1457 
1458 MSA_INTERLEAVED_IMPL_STORE3_32(uint32_t, v4u32, v4i32, u32)
1459 MSA_INTERLEAVED_IMPL_STORE3_32(int32_t, v4i32, v4i32, s32)
1460 MSA_INTERLEAVED_IMPL_STORE3_32(float, v4f32, v4i32, f32)
1461 
1462 #define MSA_INTERLEAVED_IMPL_STORE3_64(_Tp, _Tpv, suffix) \
1463 __extension__ extern __inline void \
1464 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1465 msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
1466 { \
1467  *ptr = a[0]; *(ptr + 1) = b[0]; *(ptr + 2) = c[0]; \
1468  *(ptr + 3) = a[1]; *(ptr + 4) = b[1]; *(ptr + 5) = c[1]; \
1469 }
1470 
1471 MSA_INTERLEAVED_IMPL_STORE3_64(uint64_t, v2u64, u64)
1472 MSA_INTERLEAVED_IMPL_STORE3_64(int64_t, v2i64, s64)
1473 MSA_INTERLEAVED_IMPL_STORE3_64(double, v2f64, f64)
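
/* Illustrative usage sketch (not part of the original header): the ld3q/st3q
 * pair splits and rebuilds three interleaved channels, e.g. packed BGR pixels
 * ("bgr" below is a hypothetical buffer of 16 B,G,R byte triplets):
 *
 *   uint8_t bgr[48];                  // filled elsewhere
 *   v16u8 b, g, r;
 *   msa_ld3q_u8(bgr, &b, &g, &r);     // one channel per vector
 *   msa_st3q_u8(bgr, b, g, r);        // writes the triplets back interleaved
 */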
1474 
1475 #define MSA_INTERLEAVED_IMPL_LOAD4_STORE4(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \
1476 __extension__ extern __inline void \
1477 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1478 msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
1479 { \
1480  _Tpv v0 = msa_ld1q_##suffix(ptr); \
1481  _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \
1482  _Tpv v2 = msa_ld1q_##suffix(ptr + nlanes * 2); \
1483  _Tpv v3 = msa_ld1q_##suffix(ptr + nlanes * 3); \
1484  _Tpvs t0 = __builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \
1485  _Tpvs t1 = __builtin_msa_pckev_##df((_Tpvs)v3, (_Tpvs)v2); \
1486  _Tpvs t2 = __builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \
1487  _Tpvs t3 = __builtin_msa_pckod_##df((_Tpvs)v3, (_Tpvs)v2); \
1488  *a = (_Tpv)__builtin_msa_pckev_##df(t1, t0); \
1489  *b = (_Tpv)__builtin_msa_pckev_##df(t3, t2); \
1490  *c = (_Tpv)__builtin_msa_pckod_##df(t1, t0); \
1491  *d = (_Tpv)__builtin_msa_pckod_##df(t3, t2); \
1492 } \
1493 __extension__ extern __inline void \
1494 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1495 msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
1496 { \
1497  _Tpvs v0 = __builtin_msa_ilvr_##df((_Tpvs)c, (_Tpvs)a); \
1498  _Tpvs v1 = __builtin_msa_ilvr_##df((_Tpvs)d, (_Tpvs)b); \
1499  _Tpvs v2 = __builtin_msa_ilvl_##df((_Tpvs)c, (_Tpvs)a); \
1500  _Tpvs v3 = __builtin_msa_ilvl_##df((_Tpvs)d, (_Tpvs)b); \
1501  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df(v1, v0)); \
1502  msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df(v1, v0)); \
1503  msa_st1q_##suffix(ptr + 2 * nlanes, (_Tpv)__builtin_msa_ilvr_##df(v3, v2)); \
1504  msa_st1q_##suffix(ptr + 3 * nlanes, (_Tpv)__builtin_msa_ilvl_##df(v3, v2)); \
1505 }
1506 
1507 MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint8_t, v16u8, v16i8, u8, b, 16)
1508 MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int8_t, v16i8, v16i8, s8, b, 16)
1509 MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint16_t, v8u16, v8i16, u16, h, 8)
1510 MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int16_t, v8i16, v8i16, s16, h, 8)
1511 MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint32_t, v4u32, v4i32, u32, w, 4)
1512 MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int32_t, v4i32, v4i32, s32, w, 4)
1513 MSA_INTERLEAVED_IMPL_LOAD4_STORE4(float, v4f32, v4i32, f32, w, 4)
1514 
1515 #define MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(_Tp, _Tpv, _Tpvs, suffix) \
1516 __extension__ extern __inline void \
1517 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1518 msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
1519 { \
1520  _Tpv v0 = msa_ld1q_##suffix(ptr); \
1521  _Tpv v1 = msa_ld1q_##suffix(ptr + 2); \
1522  _Tpv v2 = msa_ld1q_##suffix(ptr + 4); \
1523  _Tpv v3 = msa_ld1q_##suffix(ptr + 6); \
1524  *a = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v2, (_Tpvs)v0); \
1525  *b = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v2, (_Tpvs)v0); \
1526  *c = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v3, (_Tpvs)v1); \
1527  *d = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v3, (_Tpvs)v1); \
1528 } \
1529 __extension__ extern __inline void \
1530 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
1531 msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
1532 { \
1533  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)b, (_Tpvs)a)); \
1534  msa_st1q_##suffix(ptr + 2, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)d, (_Tpvs)c)); \
1535  msa_st1q_##suffix(ptr + 4, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)b, (_Tpvs)a)); \
1536  msa_st1q_##suffix(ptr + 6, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)d, (_Tpvs)c)); \
1537 }
1538 
1539 MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(uint64_t, v2u64, v2i64, u64)
1540 MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(int64_t, v2i64, v2i64, s64)
1541 MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(double, v2f64, v2i64, f64)
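
/* Illustrative usage sketch (not part of the original header): ld4q/st4q
 * follow the same pattern for four channels, e.g. packed RGBA pixels ("rgba"
 * below is a hypothetical buffer of 16 R,G,B,A byte quadruples):
 *
 *   uint8_t rgba[64];                     // filled elsewhere
 *   v16u8 r, g, b, a;
 *   msa_ld4q_u8(rgba, &r, &g, &b, &a);    // one channel per vector
 *   msa_st4q_u8(rgba, r, g, b, a);        // writes back interleaved
 */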
1542 
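/* Doubling multiply by a scalar, keeping the high half of each product (a
 * sketch of the intent, analogous in spirit to NEON's vqdmulhq_n_s16): each
 * 16-bit lane of the result is (2 * a[i] * b) >> 16, computed through 32-bit
 * intermediates. */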
1543 __extension__ extern __inline v8i16
1544 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
1545 msa_qdmulhq_n_s16(v8i16 a, int16_t b)
1546 {
1547  v8i16 a_lo, a_hi;
1548  ILVRL_H2_SH(a, msa_dupq_n_s16(0), a_lo, a_hi);
1549  return msa_packr_s32(msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_dupq_n_s32(b)), 1),
1550  msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_hi), msa_dupq_n_s32(b)), 1), 16);
1551 }
1552 
1553 #ifdef __cplusplus
1554 } // extern "C"
1555 #endif
1556 
1557 #endif /*__mips_msa*/
1558 #endif /* OPENCV_CORE_HAL_MSA_MACROS_H */