#ifndef OPENCV_HAL_INTRIN_LSX_HPP
#define OPENCV_HAL_INTRIN_LSX_HPP

#define CV_SIMD128_64F 1
#define CV_SIMD128_FP16 0

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 
inline __m128i _v128_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
        char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
{
    return (__m128i)v16i8{ v0, v1, v2, v3, v4, v5, v6, v7,
                           v8, v9, v10, v11, v12, v13, v14, v15 };
}
 
inline __m128i _v128_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
        char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
{
    return (__m128i)v16i8{ v15, v14, v13, v12, v11, v10, v9, v8,
                           v7, v6, v5, v4, v3, v2, v1, v0 };
}
 
inline __m128i _v128_setr_h(short v0, short v1, short v2, short v3, short v4, short v5,
        short v6, short v7)
{
    return (__m128i)v8i16{ v0, v1, v2, v3, v4, v5, v6, v7 };
}
 
inline __m128i _v128_setr_w(int v0, int v1, int v2, int v3)
{
    return (__m128i)v4i32{ v0, v1, v2, v3 };
}

inline __m128i _v128_set_w(int v0, int v1, int v2, int v3)
{
    return (__m128i)v4i32{ v3, v2, v1, v0 };
}
 
inline __m128i _v128_setall_w(int v0)
{
    return __lsx_vreplgr2vr_w(v0);
}

inline __m128i _v128_setr_d(int64 v0, int64 v1)
{
    return (__m128i)v2i64{ v0, v1 };
}

inline __m128i _v128_set_d(int64 v0, int64 v1)
{
    return (__m128i)v2i64{ v1, v0 };
}
 
inline __m128 _v128_setr_ps(float v0, float v1, float v2, float v3)
{
    return (__m128)v4f32{ v0, v1, v2, v3 };
}

inline __m128 _v128_setall_ps(float v0)
{
    return (__m128)v4f32{ v0, v0, v0, v0 };
}

inline __m128d _v128_setr_pd(double v0, double v1)
{
    return (__m128d)v2f64{ v0, v1 };
}

inline __m128d _v128_setall_pd(double v0)
{
    return (__m128d)v2f64{ v0, v0 };
}
 
inline __m128i _lsx_packus_h(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_bu_h(b, a, 0);
}

inline __m128i _lsx_packs_h(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_b_h(b, a, 0);
}

inline __m128i _lsx_packus_w(const __m128i& a, const __m128i& b)
{
    return __lsx_vssrarni_hu_w(b, a, 0);
}
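
// Illustrative note (not from the original header): _v128_setr_* fills lanes in
// memory order while _v128_set_* takes its arguments highest lane first, e.g.
//     __m128i p = _v128_setr_w(1, 2, 3, 4);   // lane0=1, lane1=2, lane2=3, lane3=4
//     __m128i q = _v128_set_w(1, 2, 3, 4);    // lane0=4, lane1=3, lane2=2, lane3=1
// The _lsx_pack* helpers narrow two vectors into one vector of half-width lanes
// with signed/unsigned saturation; the __lsx_vssrarni_* shift amount is 0 here,
// so they only saturate and narrow.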
 
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(__m128i v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }

    uchar get0() const
    {
        return (uchar)__lsx_vpickve2gr_bu(val, 0);
    }

    __m128i val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(__m128i v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }

    schar get0() const
    {
        return (schar)__lsx_vpickve2gr_b(val, 0);
    }

    __m128i val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
    }

    ushort get0() const
    {
        return (ushort)__lsx_vpickve2gr_hu(val, 0);
    }

    __m128i val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
    }

    short get0() const
    {
        return (short)__lsx_vpickve2gr_h(val, 0);
    }

    __m128i val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        val = _v128_setr_w(v0, v1, v2, v3);
    }

    unsigned get0() const
    {
        return (unsigned)__lsx_vpickve2gr_wu(val, 0);
    }

    __m128i val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _v128_setr_w(v0, v1, v2, v3);
    }

    int get0() const
    {
        return (int)__lsx_vpickve2gr_w(val, 0);
    }

    __m128i val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(__m128 v) : val(v) {}
    explicit v_float32x4(__m128i v) { val = *((__m128*)&v); }
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _v128_setr_ps(v0, v1, v2, v3);
    }

    float get0() const
    {
        union { int iv; float fv; } d;
        d.iv = __lsx_vpickve2gr_w(val, 0);
        return d.fv;
    }

    int get0toint() const
    {
        __m128i result = __lsx_vftintrz_w_s(val);
        return (int)__lsx_vpickve2gr_w(result, 0);
    }

    __m128 val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(__m128i v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        val = _v128_setr_d(v0, v1);
    }

    uint64 get0() const
    {
        return __lsx_vpickve2gr_du(val, 0);
    }

    __m128i val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(__m128i v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        val = _v128_setr_d(v0, v1);
    }

    int64 get0() const
    {
        return __lsx_vpickve2gr_d(val, 0);
    }

    __m128i val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(__m128d v) : val(v) {}
    explicit v_float64x2(__m128i v) { val = *((__m128d*)&v); }
    v_float64x2(double v0, double v1)
    {
        val = _v128_setr_pd(v0, v1);
    }

    double get0() const
    {
        union { int64 iv; double fv; } d;
        d.iv = __lsx_vpickve2gr_d(val, 0);
        return d.fv;
    }

    int64 get0toint64() const
    {
        __m128i result = __lsx_vftintrz_l_d(val);
        return (int64)__lsx_vpickve2gr_d(result, 0);
    }

    __m128d val;
};
 
#define OPENCV_HAL_IMPL_LSX_LOADSTORE(_Tpvec, _Tp)                     \
    inline _Tpvec v_load(const _Tp* ptr)                               \
    { return _Tpvec(__lsx_vld(ptr, 0)); }                              \
    inline _Tpvec v_load_aligned(const _Tp* ptr)                       \
    { return _Tpvec(__lsx_vld(ptr, 0)); }                              \
    inline _Tpvec v_load_low(const _Tp* ptr)                           \
    { return _Tpvec(__lsx_vldrepl_d(ptr, 0)); }                        \
    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
    {                                                                  \
        __m128i vl = __lsx_vldrepl_d(ptr0, 0);                         \
        __m128i vh = __lsx_vldrepl_d(ptr1, 0);                         \
        return _Tpvec(__lsx_vilvl_d(vh, vl));                          \
    }                                                                  \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                     \
    { __lsx_vst(a.val, ptr, 0); }                                      \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)             \
    { __lsx_vst(a.val, ptr, 0); }                                      \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)     \
    { __lsx_vst(a.val, ptr, 0); }                                      \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\
    {                                                                  \
        if( mode == hal::STORE_UNALIGNED )                             \
            __lsx_vst(a.val, ptr, 0);                                  \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )                  \
            __lsx_vst(a.val, ptr, 0);                                  \
        else                                                           \
            __lsx_vst(a.val, ptr, 0);                                  \
    }                                                                  \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                 \
    { __lsx_vstelm_d(a.val, ptr, 0, 0); }                              \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                \
    { __lsx_vstelm_d(a.val, ptr, 0, 1); }

OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int16x8,  short)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int32x4,  int)
 
#define OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg)        \
    inline _Tpvec v_load(const _Tp* ptr)                               \
    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); }                     \
    inline _Tpvec v_load_aligned(const _Tp* ptr)                       \
    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); }                     \
    inline _Tpvec v_load_low(const _Tp* ptr)                           \
    { return _Tpvec((halfreg)__lsx_vldrepl_d(ptr, 0)); }               \
    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
    {                                                                  \
        __m128i vl = __lsx_vldrepl_d(ptr0, 0);                         \
        __m128i vh = __lsx_vldrepl_d(ptr1, 0);                         \
        return _Tpvec((halfreg)__lsx_vilvl_d(vh, vl));                 \
    }                                                                  \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                     \
    { __lsx_vst((__m128i)a.val, ptr, 0); }                             \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)             \
    { __lsx_vst((__m128i)a.val, ptr, 0); }                             \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)     \
    { __lsx_vst((__m128i)a.val, ptr, 0); }                             \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\
    {                                                                  \
        if( mode == hal::STORE_UNALIGNED )                             \
            __lsx_vst((__m128i)a.val, ptr, 0);                         \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )                  \
            __lsx_vst((__m128i)a.val, ptr, 0);                         \
        else                                                           \
            __lsx_vst((__m128i)a.val, ptr, 0);                         \
    }                                                                  \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                 \
    { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 0); }                     \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                \
    { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 1); }

OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float32x4, float,  __m128)
OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float64x2, double, __m128d)
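
// Illustrative usage sketch (not part of the original header), assuming a float
// buffer `src` of at least 4 elements and a writable `dst`:
//     v_float32x4 v = v_load(src);   // 16-byte load via __lsx_vld
//     v_store(dst, v);               // store back via __lsx_vst
// Note that on LSX the aligned, unaligned and non-temporal store variants above
// all map to the same __lsx_vld/__lsx_vst instructions.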
 
inline __m128i _lsx_128_castps_si128(const __m128& v)
{ return __m128i(v); }

inline __m128i _lsx_128_castpd_si128(const __m128d& v)
{ return __m128i(v); }

#define OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, _Tpvecf, suffix, cast)  \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)    \
    { return _Tpvec(cast(a.val)); }

#define OPENCV_HAL_IMPL_LSX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)           \
    inline _Tpvec v_setzero_##suffix()                                            \
    { return _Tpvec(__lsx_vldi(0)); }                                             \
    inline _Tpvec v_setall_##suffix(_Tp v)                                        \
    { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); }                    \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16,  suffix, OPENCV_HAL_NOP)         \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,   suffix, OPENCV_HAL_NOP)         \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8,  suffix, OPENCV_HAL_NOP)         \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8,   suffix, OPENCV_HAL_NOP)         \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4,  suffix, OPENCV_HAL_NOP)         \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4,   suffix, OPENCV_HAL_NOP)         \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2,  suffix, OPENCV_HAL_NOP)         \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2,   suffix, OPENCV_HAL_NOP)         \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float32x4, suffix, _lsx_128_castps_si128)  \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float64x2, suffix, _lsx_128_castpd_si128)

OPENCV_HAL_IMPL_LSX_INIT(v_int16x8,  short,    s16, h, int)
OPENCV_HAL_IMPL_LSX_INIT(v_uint32x4, unsigned, u32, w, int)
OPENCV_HAL_IMPL_LSX_INIT(v_int32x4,  int,      s32, w, int)
 
inline __m128 _lsx_128_castsi128_ps(const __m128i& v)
{ return __m128(v); }

inline __m128d _lsx_128_castsi128_pd(const __m128i& v)
{ return __m128d(v); }

#define OPENCV_HAL_IMPL_LSX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast)    \
    inline _Tpvec v_setzero_##suffix()                                      \
    { return _Tpvec(__lsx_vldi(0)); }                                       \
    inline _Tpvec v_setall_##suffix(_Tp v)                                  \
    { return _Tpvec(_v128_setall_##zsuffix(v)); }                           \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16,  suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,   suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8,  suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8,   suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4,  suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4,   suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2,  suffix, cast)             \
    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2,   suffix, cast)

OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float32x4, float,  f32, ps, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float64x2, double, f64, pd, _lsx_128_castsi128_pd)
 
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a)
{ return v_float32x4(_lsx_128_castps_si128(__m128(a.val))); }

inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a)
{ return v_float64x2(_lsx_128_castpd_si128(__m128d(a.val))); }
 
#define OPENCV_HAL_IMPL_LSX_UNPACK(_Tpvec, suffix)                            \
    inline _Tpvec v128_unpacklo(const _Tpvec& a, const _Tpvec& b)             \
    { return _Tpvec(__lsx_vilvl_##suffix(__m128i(b.val), __m128i(a.val))); }  \
    inline _Tpvec v128_unpackhi(const _Tpvec& a, const _Tpvec& b)             \
    { return _Tpvec(__lsx_vilvh_##suffix(__m128i(b.val), __m128i(a.val))); }

#define OPENCV_HAL_IMPL_LSX_ZIP(_Tpvec)                               \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)     \
    { return (_Tpvec)__lsx_vilvl_d((__m128i)b.val, (__m128i)a.val); } \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)    \
    { return (_Tpvec)__lsx_vilvh_d((__m128i)b.val, (__m128i)a.val); } \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,         \
                            _Tpvec& c, _Tpvec& d)                     \
    {                                                                 \
        __m128i a1 = (__m128i)a.val, b1 = (__m128i)b.val;             \
        c = _Tpvec(__lsx_vilvl_d(b1, a1));                            \
        d = _Tpvec(__lsx_vilvh_d(b1, a1));                            \
    }                                                                 \
    inline void v_zip(const _Tpvec& a, const _Tpvec& b,               \
                      _Tpvec& ab0, _Tpvec& ab1)                       \
    {                                                                 \
        ab0 = v128_unpacklo(a, b);                                    \
        ab1 = v128_unpackhi(a, b);                                    \
    }

#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin)            \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(intrin(a.val, b.val)); }                          \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)    \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16,  __lsx_vsadd_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16,  __lsx_vssub_bu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16,   __lsx_vsadd_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16,   __lsx_vssub_b)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8,  __lsx_vsadd_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8,  __lsx_vssub_hu)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8,   __lsx_vsadd_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8,   __lsx_vssub_h)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4,  __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4,  __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4,  __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4,   __lsx_vadd_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4,   __lsx_vsub_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4,   __lsx_vmul_w)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2,  __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2,  __lsx_vsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2,   __lsx_vadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2,   __lsx_vsub_d)

OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
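
// Illustrative note (not from the original header): for 8- and 16-bit lanes the
// operators above are saturating (__lsx_vsadd_*/__lsx_vssub_*), while 32- and
// 64-bit integer lanes wrap around (__lsx_vadd_*/__lsx_vsub_*). A sketch, assuming
// the v_setall_u8 initializer from the full universal-intrinsics API:
//     v_uint8x16 x = v_setall_u8(200), y = v_setall_u8(100);
//     v_uint8x16 s = x + y;   // every lane saturates to 255, not 44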
 
inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i pev = __lsx_vmulwev_w_hu(a0, b0);
    __m128i pod = __lsx_vmulwod_w_hu(a0, b0);
    __m128i pl  = __lsx_vilvl_w(pod, pev);
    __m128i ph  = __lsx_vilvh_w(pod, pev);
    return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
}

inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i pev = __lsx_vmulwev_w_h(a0, b0);
    __m128i pod = __lsx_vmulwod_w_h(a0, b0);
    __m128i pl  = __lsx_vilvl_w(pod, pev);
    __m128i ph  = __lsx_vilvh_w(pod, pev);
    return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
}
 
inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
{ a = a * b; return a; }
inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
{ a = a * b; return a; }
inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
{ a = a * b; return a; }
inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
{ a = a * b; return a; }
 
#define OPENCV_HAL_IMPL_LSX_BIN_FUNC(func, _Tpvec, intrin)         \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)           \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint8x16,  __lsx_vadd_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int8x16,   __lsx_vadd_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint16x8,  __lsx_vadd_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int16x8,   __lsx_vadd_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint8x16,  __lsx_vsub_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int8x16,   __lsx_vsub_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint16x8,  __lsx_vsub_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int16x8,   __lsx_vsub_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_uint16x8,  __lsx_vmul_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_int16x8,   __lsx_vmul_h)
 
inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
    // keep only the low byte of each 16-bit product
    return v_uint8x16(__lsx_vpackev_b(p1, p0));
}

inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}
 
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, v_uint16x8& c, v_uint16x8& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
    c.val = __lsx_vilvl_h(p1, p0);
    d.val = __lsx_vilvh_h(p1, p0);
}

inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, v_int16x8& c, v_int16x8& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_h_b(a0, b0);
    __m128i p1 = __lsx_vmulwod_h_b(a0, b0);
    c.val = __lsx_vilvl_h(p1, p0);
    d.val = __lsx_vilvh_h(p1, p0);
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_w_h(a0, b0);
    __m128i p1 = __lsx_vmulwod_w_h(a0, b0);
    c.val = __lsx_vilvl_w(p1, p0);
    d.val = __lsx_vilvh_w(p1, p0);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_w_hu(a0, b0);
    __m128i p1 = __lsx_vmulwod_w_hu(a0, b0);
    c.val = __lsx_vilvl_w(p1, p0);
    d.val = __lsx_vilvh_w(p1, p0);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
{
    __m128i a0 = a.val, b0 = b.val;
    __m128i p0 = __lsx_vmulwev_d_wu(a0, b0);
    __m128i p1 = __lsx_vmulwod_d_wu(a0, b0);
    c.val = __lsx_vilvl_d(p1, p0);
    d.val = __lsx_vilvh_d(p1, p0);
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{ return v_int16x8(__lsx_vmuh_h(a.val, b.val)); }

inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint16x8(__lsx_vmuh_hu(a.val, b.val)); }
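
// Illustrative sketch (not from the original header): v_mul_expand keeps full
// precision by widening, e.g. for 16-bit inputs:
//     v_int16x8 a = ..., b = ...;
//     v_int32x4 c, d;
//     v_mul_expand(a, b, c, d);   // c = products of lanes 0..3, d = lanes 4..7
// v_mul_hi returns only the high 16 bits of each 32-bit product (__lsx_vmuh_*).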
 
#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)                 \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm)                           \
    { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }  \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm)                           \
    { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }  \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)                           \
    { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }  \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)                           \
    { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); }                 \
    template<int imm>                                                                \
    inline _Tpuvec v_shl(const _Tpuvec& a)                                           \
    { return _Tpuvec(__lsx_vslli_##suffix(a.val, imm)); }                            \
    template<int imm>                                                                \
    inline _Tpsvec v_shl(const _Tpsvec& a)                                           \
    { return _Tpsvec(__lsx_vslli_##suffix(a.val, imm)); }                            \
    template<int imm>                                                                \
    inline _Tpuvec v_shr(const _Tpuvec& a)                                           \
    { return _Tpuvec(__lsx_vsrli_##suffix(a.val, imm)); }                            \
    template<int imm>                                                                \
    inline _Tpsvec v_shr(const _Tpsvec& a)                                           \
    { return _Tpsvec(__lsx_vsrai_##suffix(a.val, imm)); }

#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix)                                 \
    OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix)                       \
    OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix)                        \
    OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix)                       \
    inline _Tpvec operator ~(const _Tpvec& a)                                        \
    { return _Tpvec(__lsx_vnori_b(a.val, 0)); }

OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int8x16, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int16x8, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int32x4, v)
OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v)
 
#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast)               \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)                 \
    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }                   \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)                   \
    {                                                                                \
        __m128i c = intrin((__m128i)(a.val), (__m128i)b.val);                        \
        a.val = cast(c);                                                             \
        return a;                                                                    \
    }

#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast)                             \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast)                  \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast)                   \
    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast)                  \
    inline _Tpvec operator ~ (const _Tpvec& a)                                       \
    { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); }

OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float64x2, _lsx_128_castsi128_pd)
 
#define OPENCV_HAL_IMPL_LSX_SELECT(_Tpvec)                                           \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b)     \
    { return _Tpvec(__lsx_vbitsel_v(b.val, a.val, mask.val)); }

inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }

inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
{ return v_float64x2(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }
 
#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec)                            \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)         \
    { return ~( a == b ); }                                              \
    inline _Tpvec operator <  (const _Tpvec& a, const _Tpvec& b)         \
    { return b > a; }                                                    \
    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)         \
    { return ~(a < b); }                                                 \
    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)         \
    { return b >= a; }

#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix)    \
    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)          \
    { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); }                   \
    inline _Tpuvec operator >  (const _Tpuvec& a, const _Tpuvec& b)          \
    { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); }                  \
    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)          \
    { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); }                   \
    inline _Tpsvec operator >  (const _Tpsvec& a, const _Tpsvec& b)          \
    { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); }                   \
    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec)                                   \
    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)

#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix)          \
    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)  \
    { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); }         \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
    { return ~(a == b); }

OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2,  d)
 
#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix)       \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)       \
    { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); }

#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix)                    \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix)            \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix)            \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(<,  vfcmp_clt, _Tpvec, ssuffix)            \
    OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix)

inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }

inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }

inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }

inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }

inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(__lsx_vfcmp_cor_s(a.val, a.val)); }

inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return v_float64x2(__lsx_vfcmp_cor_d(a.val, a.val)); }
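
// Illustrative sketch (not from the original header): comparisons produce all-ones
// masks per lane, which combine naturally with v_select, e.g.
//     v_float32x4 m = a > b;              // __lsx_vfcmp_clt_s(b, a)
//     v_float32x4 r = v_select(m, a, b);  // per-lane max via __lsx_vbitsel_v
// v_not_nan(a) is true for lanes where a is ordered (not NaN), using vfcmp_cor.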
 
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint8x16,  __lsx_vmin_bu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint8x16,  __lsx_vmax_bu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int8x16,   __lsx_vmin_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int8x16,   __lsx_vmax_b)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint16x8,  __lsx_vmin_hu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint16x8,  __lsx_vmax_hu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int16x8,   __lsx_vmin_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int16x8,   __lsx_vmax_h)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint32x4,  __lsx_vmin_wu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint32x4,  __lsx_vmax_wu)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int32x4,   __lsx_vmin_w)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int32x4,   __lsx_vmax_w)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float32x4, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float32x4, __lsx_vfmax_s)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float64x2, __lsx_vfmin_d)
OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float64x2, __lsx_vfmax_d)
 
template<int imm,
    bool is_invalid = ((imm < 0) || (imm > 16)),
    bool is_first = (imm == 0),
    bool is_half = (imm == 8),
    bool is_second = (imm == 16),
    bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
class v_lsx_palignr_u8_class;

template<int imm>
class v_lsx_palignr_u8_class<imm, true, false, false, false, false>;

template<int imm>
class v_lsx_palignr_u8_class<imm, false, true, false, false, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        CV_UNUSED(b);
        return a;
    }
};

template<int imm>
class v_lsx_palignr_u8_class<imm, false, false, true, false, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        return __lsx_vshuf4i_d(a, b, 0x9);
    }
};

template<int imm>
class v_lsx_palignr_u8_class<imm, false, false, false, true, false>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        CV_UNUSED(a);
        return b;
    }
};

template<int imm>
class v_lsx_palignr_u8_class<imm, false, false, false, false, true>
{
public:
    inline __m128i operator()(const __m128i& a, const __m128i& b) const
    {
        enum { imm2 = (sizeof(__m128i) - imm) };
        return __lsx_vor_v(__lsx_vbsrl_v(a, imm), __lsx_vbsll_v(b, imm2));
    }
};

template<int imm>
inline __m128i v_lsx_palignr_u8(const __m128i& a, const __m128i& b)
{
    CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_lsx_palignr_u8");
    return v_lsx_palignr_u8_class<imm>()(a, b);
}
 
#define OPENCV_HAL_IMPL_LSX_ROTATE_CAST(_Tpvec, cast)                                   \
    template<int imm>                                                                   \
    inline _Tpvec v_rotate_right(const _Tpvec& a)                                       \
    {                                                                                   \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };                     \
        __m128i ret = __lsx_vbsrl_v((__m128i)a.val, imm2);                              \
        return _Tpvec(cast(ret));                                                       \
    }                                                                                   \
    template<int imm>                                                                   \
    inline _Tpvec v_rotate_left(const _Tpvec& a)                                        \
    {                                                                                   \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };                     \
        __m128i ret = __lsx_vbsll_v((__m128i)a.val, imm2);                              \
        return _Tpvec(cast(ret));                                                       \
    }                                                                                   \
    template<int imm>                                                                   \
    inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                      \
    {                                                                                   \
        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };                     \
        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)a.val, (__m128i)b.val)));    \
    }                                                                                   \
    template<int imm>                                                                   \
    inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                       \
    {                                                                                   \
        enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };  \
        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)b.val, (__m128i)a.val)));    \
    }

OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int8x16,  OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint16x8, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int16x8,  OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint32x4, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int32x4,  OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint64x2, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int64x2,  OPENCV_HAL_NOP)

OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float32x4, _lsx_128_castsi128_ps)
OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float64x2, _lsx_128_castsi128_pd)
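
// Illustrative note (not from the original header): the rotate helpers shift whole
// lanes across the register, e.g.
//     v_uint8x16 r = v_rotate_right<2>(a);     // bytes 2..15 of a, then two zero bytes
//     v_uint8x16 s = v_rotate_right<2>(a, b);  // bytes 2..15 of a, then bytes 0..1 of b
// Both are built on v_lsx_palignr_u8, which dispatches on the byte count at
// compile time (vbsrl/vbsll/vshuf4i_d, or a plain pass-through).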
 
inline v_uint8x16 v_reverse(const v_uint8x16& a)
{
    __m128i vec = __lsx_vshuf4i_b(a.val, 0x1B);
    return v_uint8x16(__lsx_vshuf4i_w(vec, 0x1B));
}

inline v_int8x16 v_reverse(const v_int8x16& a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x8 v_reverse(const v_uint16x8& a)
{
    __m128i vec = __lsx_vshuf4i_h(a.val, 0x1B);
    return v_uint16x8(__lsx_vshuf4i_w(vec, 0x4E));
}

inline v_int16x8 v_reverse(const v_int16x8& a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x4 v_reverse(const v_uint32x4& a)
{ return v_uint32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }

inline v_int32x4 v_reverse(const v_int32x4& a)
{ return v_int32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }

inline v_uint64x2 v_reverse(const v_uint64x2& a)
{ return v_uint64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }

inline v_int64x2 v_reverse(const v_int64x2& a)
{ return v_int64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }

inline v_float32x4 v_reverse(const v_float32x4& a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float64x2 v_reverse(const v_float64x2& a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
 
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
    __m128i t1 = __lsx_vhaddw_hu_bu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline int v_reduce_sum(const v_int8x16& a)
{
    __m128i t1 = __lsx_vhaddw_h_b(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_w_h(t1, t1);
    __m128i t3 = __lsx_vhaddw_d_w(t2, t2);
    __m128i t4 = __lsx_vhaddw_q_d(t3, t3);
    return (int)__lsx_vpickve2gr_w(t4, 0);
}
 
#define OPENCV_HAL_IMPL_LSX_REDUCE_16(_Tpvec, sctype, func, intrin)            \
    inline sctype v_reduce_##func(const _Tpvec& a)                             \
    {                                                                          \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8));                  \
        val = intrin(val, __lsx_vbsrl_v(val, 4));                              \
        val = intrin(val, __lsx_vbsrl_v(val, 2));                              \
        val = intrin(val, __lsx_vbsrl_v(val, 1));                              \
        return (sctype)__lsx_vpickve2gr_b(val, 0);                             \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, min, __lsx_vmin_b)
OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, max, __lsx_vmax_b)

#define OPENCV_HAL_IMPL_LSX_REDUCE_8(_Tpvec, sctype, func, intrin)             \
    inline sctype v_reduce_##func(const _Tpvec& a)                             \
    {                                                                          \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8));                  \
        val = intrin(val, __lsx_vbsrl_v(val, 4));                              \
        val = intrin(val, __lsx_vbsrl_v(val, 2));                              \
        return (sctype)__lsx_vpickve2gr_h(val, 0);                             \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, min, __lsx_vmin_h)
OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, max, __lsx_vmax_h)

#define OPENCV_HAL_IMPL_LSX_REDUCE_4(_Tpvec, sctype, func, intrin)             \
    inline sctype v_reduce_##func(const _Tpvec& a)                             \
    {                                                                          \
        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8));                  \
        val = intrin(val, __lsx_vbsrl_v(val, 4));                              \
        return (sctype)__lsx_vpickve2gr_w(val, 0);                             \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, min, __lsx_vmin_wu)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, max, __lsx_vmax_wu)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4,  int,      min, __lsx_vmin_w)
OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4,  int,      max, __lsx_vmax_w)

#define OPENCV_HAL_IMPL_LSX_REDUCE_FLT(func, intrin)                           \
    inline float v_reduce_##func(const v_float32x4& a)                         \
    {                                                                          \
        __m128 val = a.val;                                                    \
        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 8));             \
        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 4));             \
        float* fval = (float*)&val;                                            \
        return *fval;                                                          \
    }

OPENCV_HAL_IMPL_LSX_REDUCE_FLT(min, __lsx_vfmin_s)
OPENCV_HAL_IMPL_LSX_REDUCE_FLT(max, __lsx_vfmax_s)
 
inline int v_reduce_sum(const v_int32x4& a)
{
    __m128i t1 = __lsx_vhaddw_d_w(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_q_d(t1, t1);
    return (int)__lsx_vpickve2gr_w(t2, 0);
}

inline unsigned v_reduce_sum(const v_uint32x4& a)
{
    __m128i t1 = __lsx_vhaddw_du_wu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (int)__lsx_vpickve2gr_w(t2, 0);
}

inline int v_reduce_sum(const v_int16x8& a)
{
    __m128i t1 = __lsx_vhaddw_w_h(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_d_w(t1, t1);
    __m128i t3 = __lsx_vhaddw_q_d(t2, t2);
    return (int)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sum(const v_uint16x8& a)
{
    __m128i t1 = __lsx_vhaddw_wu_hu(a.val, a.val);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (int)__lsx_vpickve2gr_w(t3, 0);
}
 
inline float v_reduce_sum(const v_float32x4& a)
{
    __m128i val = (__m128i)a.val;
    val = __lsx_vbsrl_v(val, 8);
    __m128 result = __lsx_vfadd_s(a.val, (__m128)val);
    float* pa = (float*)&result;
    return (float)(pa[0] + pa[1]);
}

inline uint64 v_reduce_sum(const v_uint64x2& a)
{
    __m128i t0 = __lsx_vhaddw_qu_du(a.val, a.val);
    return (uint64)__lsx_vpickve2gr_du(t0, 0);
}

inline int64 v_reduce_sum(const v_int64x2& a)
{
    __m128i t0 = __lsx_vhaddw_q_d(a.val, a.val);
    return (int64)__lsx_vpickve2gr_d(t0, 0);
}

inline double v_reduce_sum(const v_float64x2& a)
{
    double* pa = (double*)&a;
    return pa[0] + pa[1];
}
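
// Illustrative sketch (not from the original header): the horizontal reductions
// above collapse a whole register into one scalar, e.g.
//     v_uint8x16 v = v_load(buf);      // 16 bytes
//     unsigned s  = v_reduce_sum(v);   // sum of all 16 lanes (at most 16 * 255)
// The integer versions chain the widening __lsx_vhaddw_* intrinsics so the
// accumulator never overflows.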
 
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    __m128i a0 = (__m128i)a.val;
    __m128i b0 = (__m128i)b.val;
    __m128i c0 = (__m128i)c.val;
    __m128i d0 = (__m128i)d.val;
    __m128i ac_l = __lsx_vilvl_w(c0, a0);
    __m128i ac_h = __lsx_vilvh_w(c0, a0);
    __m128i bd_l = __lsx_vilvl_w(d0, b0);
    __m128i bd_h = __lsx_vilvh_w(d0, b0);
    __m128  ac   = __lsx_vfadd_s((__m128)ac_l, (__m128)ac_h);
    __m128  bd   = __lsx_vfadd_s((__m128)bd_l, (__m128)bd_h);
    return v_float32x4(__lsx_vfadd_s((__m128)__lsx_vilvl_w((__m128i)bd, (__m128i)ac),
                       (__m128)__lsx_vilvh_w((__m128i)bd, (__m128i)ac)));
}
 
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
    __m128i t0 = __lsx_vabsd_b(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i t0 = __lsx_vabsd_bu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
}

inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i t0 = __lsx_vabsd_hu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
    __m128i t0 = __lsx_vabsd_h(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
}

inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i t0 = __lsx_vabsd_wu(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
}

inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
    __m128i t0 = __lsx_vabsd_w(a.val, b.val);
    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
}
 
#define OPENCV_HAL_IMPL_LSX_POPCOUNT(_Tpvec, _Tp, suffix)                  \
inline _Tpvec v_popcount(const _Tp& a)                                     \
{ return _Tpvec(__lsx_vpcnt_##suffix(a.val)); }

#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt)              \
inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }

OPENCV_HAL_IMPL_REINTERPRET_INT(ushort,   short)
OPENCV_HAL_IMPL_REINTERPRET_INT(short,    short)
OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int,      int)
OPENCV_HAL_IMPL_REINTERPRET_INT(float,    int)
OPENCV_HAL_IMPL_REINTERPRET_INT(double,   int64)
 
inline int v_signmask(const v_int8x16& a)
{
    __m128i result = __lsx_vmskltz_b(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_uint8x16& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }

inline int v_signmask(const v_int16x8& a)
{
    __m128i result = __lsx_vmskltz_h(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_uint16x8& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }

inline int v_signmask(const v_uint32x4& a)
{
    __m128i result = __lsx_vmskltz_w(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }

inline int v_signmask(const v_uint64x2& a)
{
    __m128i result = __lsx_vmskltz_d(a.val);
    return __lsx_vpickve2gr_w(result, 0);
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
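
// Illustrative note (not from the original header): v_signmask packs the sign bit
// of every lane into the low bits of an int, lane 0 in bit 0, e.g.
//     v_int32x4 d = a - b;
//     int neg = v_signmask(d);   // bit i set iff lane i of d is negative
// v_check_all/v_check_any below compare that mask against the all-lanes constant.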
 
#define OPENCV_HAL_IMPL_LSX_CHECK(_Tpvec, allmask) \
    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }

OPENCV_HAL_IMPL_LSX_CHECK(v_int8x16, 65535)
OPENCV_HAL_IMPL_LSX_CHECK(v_int16x8, 255)
 
#define OPENCV_HAL_IMPL_LSX_MULADD(_Tpvec, suffix)                              \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)      \
    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); }              \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)   \
    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); }              \
    inline _Tpvec v_sqrt(const _Tpvec& x)                                       \
    { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); }                            \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)             \
    { return v_fma(a, a, b * b); }                                              \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)                 \
    { return v_sqrt(v_fma(a, a, b * b)); }

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_int32x4(__lsx_vmadd_w(c.val, a.val, b.val)); }

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{ return v_fma(a, b, c); }
 
#define OPENCV_HAL_IMPL_LSX_ABS(_Tpvec, suffix)                          \
    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)                        \
    { return v_u##_Tpvec(__lsx_vabsd_##suffix(x.val, __lsx_vldi(0))); }

OPENCV_HAL_IMPL_LSX_ABS(int8x16, b)
OPENCV_HAL_IMPL_LSX_ABS(int16x8, h)
OPENCV_HAL_IMPL_LSX_ABS(int32x4, w)

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(*((__m128i*)&x) & __lsx_vreplgr2vr_w(0x7fffffff)); }

inline v_float64x2 v_abs(const v_float64x2& x)
{ return v_float64x2(*((__m128i*)&x) & __lsx_vreplgr2vr_d(0x7fffffffffffffff)); }
 
inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
{ return (v_uint8x16)__lsx_vabsd_bu(a.val, b.val); }

inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
{ return (v_uint16x8)__lsx_vabsd_hu(a.val, b.val); }

inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{ return (v_uint32x4)__lsx_vabsd_wu(a.val, b.val); }

inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
{ return (v_uint8x16)__lsx_vabsd_b(a.val, b.val); }

inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
{ return (v_uint16x8)__lsx_vabsd_h(a.val, b.val); }

inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{ return v_abs(a - b); }

inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{ return v_abs(a - b); }

inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
{ return v_max(a, b) - v_min(a, b); }
 
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(__lsx_vftint_w_s(a.val)); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(__lsx_vftint_w_d(a.val, a.val)); }

inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{ return v_int32x4(__lsx_vftint_w_d(b.val, a.val)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(a.val)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(__lsx_vftintrz_w_d(a.val, a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrm_s(a.val)))); }

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrp_s(a.val)))); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{ return v_float32x4(__lsx_vfcvt_s_d(a.val, a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{ return v_float32x4(__lsx_vfcvt_s_d(b.val, a.val)); }
 
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    return v_int8x16(_v128_setr_b(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]], tab[idx[8]],
                     tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]],
                     tab[idx[14]], tab[idx[15]]));
}

inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    return v_int8x16(_v128_setr_h(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]),
           *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]), *(const short*)(tab + idx[4]),
           *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
}

inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x16(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
}

inline v_uint8x16 v_lut(const uchar* tab, const int* idx)
{ return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
 
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x8(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
                *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
}

inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
}

inline v_uint16x8 v_lut(const ushort* tab, const int* idx)
{ return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }

inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx)
{ return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }

inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx)
{ return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }

inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
}
 
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
{ return v_reinterpret_as_u64(v_lut((const int64_t*)tab, idx)); }

inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    return v_float32x4((__m128)_v128_setr_pd(*(const double*)(tab + idx[0]), *(const double*)(tab + idx[1])));
}
 
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int* idx = (int*)&idxvec.val;
    return v_lut(tab, idx);
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int*)tab, idxvec));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    const int* idx = (const int*)&idxvec.val;
    return v_lut(tab, idx);
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    const int* idx = (const int*)&idxvec.val;
    return v_lut(tab, idx);
}
 
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    const int* idx = (const int*)&idxvec.val;
    __m128i xy0  = __lsx_vld(tab + idx[0], 0);
    __m128i xy1  = __lsx_vld(tab + idx[1], 0);
    __m128i xy2  = __lsx_vld(tab + idx[2], 0);
    __m128i xy3  = __lsx_vld(tab + idx[3], 0);
    __m128i xy01 = __lsx_vilvl_d(xy1, xy0);
    __m128i xy23 = __lsx_vilvl_d(xy3, xy2);
    __m128i xxyy02 = __lsx_vilvl_w(xy23, xy01);
    __m128i xxyy13 = __lsx_vilvh_w(xy23, xy01);
    x = v_float32x4((__m128)__lsx_vilvl_w(xxyy13, xxyy02));
    y = v_float32x4((__m128)__lsx_vilvh_w(xxyy13, xxyy02));
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    const int* idx = (const int*)&idxvec.val;
    __m128i xy0 = __lsx_vld(tab + idx[0], 0);
    __m128i xy1 = __lsx_vld(tab + idx[1], 0);
    x = v_float64x2(__lsx_vilvl_d(xy1, xy0));
    y = v_float64x2(__lsx_vilvh_d(xy1, xy0));
}
 
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
                _v128_setr_d(0x0705060403010200, 0x0f0d0e0c0b090a08)));
}

inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
                _v128_setr_d(0x0703060205010400, 0x0f0b0e0a0d090c08)));
}

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
                _v128_setr_d(0x0706030205040100, 0x0f0e0b0a0d0c0908)));
}

inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
                _v128_setr_d(0x0b0a030209080100, 0x0f0e07060d0c0504)));
}

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    return v_int32x4(__lsx_vshuf4i_w(vec.val, 0xd8));
}

inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    __m128i zero = __lsx_vldi(0);
    return v_int8x16(__lsx_vshuf_b(zero, vec.val,
           _v128_set_d(0x1211100f0e0d0c0a, 0x0908060504020100)));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    __m128i zero = __lsx_vldi(0);
    return v_int16x8(__lsx_vshuf_b(zero, vec.val,
           _v128_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    __m128i x = a.val, y = b.val;
    return v_int32x4(__lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    __m128i x = a.val, y = b.val, z = c.val;
    __m128i t = __lsx_vmaddwev_w_h(z, x, y);
    return v_int32x4(__lsx_vmaddwod_w_h(t, x, y));
}

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    __m128i x = a.val, y = b.val;
    return v_int64x2(__lsx_vmaddwod_d_w(__lsx_vmulwev_d_w(x, y), x, y));
}

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    __m128i x = a.val, y = b.val, z = c.val;
    __m128i t = __lsx_vmaddwev_d_w(z, x, y);
    return v_int64x2(__lsx_vmaddwod_d_w(t, x, y));
}

inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even  = __lsx_vmulwev_h_bu(x, y);
    __m128i odd   = __lsx_vmulwod_h_bu(x, y);
    __m128i prod0 = __lsx_vhaddw_wu_hu(even, even);
    __m128i prod1 = __lsx_vhaddw_wu_hu(odd, odd);
    return v_uint32x4(__lsx_vadd_w(prod0, prod1));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even  = __lsx_vmulwev_h_b(x, y);
    __m128i odd   = __lsx_vmulwod_h_b(x, y);
    __m128i prod0 = __lsx_vhaddw_w_h(even, even);
    __m128i prod1 = __lsx_vhaddw_w_h(odd, odd);
    return v_int32x4(__lsx_vadd_w(prod0, prod1));
}

inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even  = __lsx_vmulwev_w_hu(x, y);
    __m128i odd   = __lsx_vmulwod_w_hu(x, y);
    __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
    __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
    return v_uint64x2(__lsx_vadd_d(prod0, prod1));
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even  = __lsx_vmulwev_w_h(x, y);
    __m128i odd   = __lsx_vmulwod_w_h(x, y);
    __m128i prod0 = __lsx_vhaddw_d_w(even, even);
    __m128i prod1 = __lsx_vhaddw_d_w(odd, odd);
    return v_int64x2(__lsx_vadd_d(prod0, prod1));
}
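
// Illustrative sketch (not from the original header): v_dotprod multiplies adjacent
// lane pairs and adds them, v_dotprod_expand additionally widens twice, e.g.
//     v_int16x8 a = ..., b = ...;
//     v_int32x4 p = v_dotprod(a, b);         // p[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]
//     v_int64x2 q = v_dotprod_expand(a, b);  // four 16-bit products per 64-bit lane
// Both are built from the even/odd widening multiplies (__lsx_vmulwev_*/__lsx_vmulwod_*).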
 
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i even  = __lsx_vmulwev_w_hu(x, y);
    __m128i odd   = __lsx_vmulwod_w_hu(x, y);
    __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
    __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
    return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
}

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    __m128i x = a.val, y = b.val;
    __m128i prod = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y);
    __m128i sign = __lsx_vsrai_w(prod, 31);
    __m128i lo   = __lsx_vilvl_w(sign, prod);
    __m128i hi   = __lsx_vilvh_w(sign, prod);
    return v_int64x2(__lsx_vadd_d(lo, hi));
}
 
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128i x = (__m128i)v.val;
    __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
    __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
    __m128 v2 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val);
    __m128 v3 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xFF), m3.val);

    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), __lsx_vfadd_s(v2, v3)));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128i x = (__m128i)v.val;
    __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
    __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
    __m128 v2 = __lsx_vfmadd_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val, a.val);

    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), v2));
}
 
#define OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(_Tpvec, cast_from, cast_to)                          \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,                            \
                               const _Tpvec& a2, const _Tpvec& a3,                            \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)                \
    {                                                                                         \
        __m128i t0 = cast_from(__lsx_vilvl_w(a1.val, a0.val));                                \
        __m128i t1 = cast_from(__lsx_vilvl_w(a3.val, a2.val));                                \
        __m128i t2 = cast_from(__lsx_vilvh_w(a1.val, a0.val));                                \
        __m128i t3 = cast_from(__lsx_vilvh_w(a3.val, a2.val));                                \
        b0.val = cast_to(__lsx_vilvl_d(t1, t0));                                              \
        b1.val = cast_to(__lsx_vilvh_d(t1, t0));                                              \
        b2.val = cast_to(__lsx_vilvl_d(t3, t2));                                              \
        b3.val = cast_to(__lsx_vilvh_d(t3, t2));                                              \
    }

OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_uint32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_int32x4,  OPENCV_HAL_NOP, OPENCV_HAL_NOP)
 
inline void v_transpose4x4(const v_float32x4& a0, const v_float32x4& a1,
                           const v_float32x4& a2, const v_float32x4& a3,
                           v_float32x4& b0, v_float32x4& b1, v_float32x4& b2, v_float32x4& b3)
{
    __m128i vec0 = (__m128i)a0.val, vec1 = (__m128i)a1.val;
    __m128i vec2 = (__m128i)a2.val, vec3 = (__m128i)a3.val;
    __m128i t0 = __lsx_vilvl_w(vec1, vec0);
    __m128i t1 = __lsx_vilvl_w(vec3, vec2);
    __m128i t2 = __lsx_vilvh_w(vec1, vec0);
    __m128i t3 = __lsx_vilvh_w(vec3, vec2);
    b0.val = __m128(__lsx_vilvl_d(t1, t0));
    b1.val = __m128(__lsx_vilvh_d(t1, t0));
    b2.val = __m128(__lsx_vilvl_d(t3, t2));
    b3.val = __m128(__lsx_vilvh_d(t3, t2));
}
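
// Illustrative sketch (not from the original header): v_transpose4x4 turns four row
// vectors into four column vectors, e.g. for float rows r0..r3:
//     v_float32x4 c0, c1, c2, c3;
//     v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);  // c0 = {r0[0], r1[0], r2[0], r3[0]}
// The implementation is two interleave passes: 32-bit (vilvl_w/vilvh_w) then 64-bit
// (vilvl_d/vilvh_d).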
 
#define OPENCV_HAL_IMPL_LSX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin_lo, intrin_hi)     \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)                \
    {                                                                              \
        b0.val = intrin_lo(a.val, 0);                                              \
        b1.val = intrin_hi(a.val);                                                 \
    }                                                                              \
    inline _Tpwvec v_expand_low(const _Tpvec& a)                                   \
    { return _Tpwvec(intrin_lo(a.val, 0)); }                                       \
    inline _Tpwvec v_expand_high(const _Tpvec& a)                                  \
    { return _Tpwvec(intrin_hi(a.val)); }                                          \
    inline _Tpwvec v_load_expand(const _Tp* ptr)                                   \
    {                                                                              \
        __m128i a = __lsx_vld(ptr, 0);                                             \
        return _Tpwvec(intrin_lo(a, 0));                                           \
    }

OPENCV_HAL_IMPL_LSX_EXPAND(v_int16x8,  v_int32x4,  short,    __lsx_vsllwil_w_h,   __lsx_vexth_w_h)
OPENCV_HAL_IMPL_LSX_EXPAND(v_uint32x4, v_uint64x2, unsigned, __lsx_vsllwil_du_wu, __lsx_vexth_du_wu)
OPENCV_HAL_IMPL_LSX_EXPAND(v_int32x4,  v_int64x2,  int,      __lsx_vsllwil_d_w,   __lsx_vexth_d_w)

#define OPENCV_HAL_IMPL_LSX_EXPAND_Q(_Tpvec, _Tp, intrin_lo, intrin_hi)          \
    inline _Tpvec v_load_expand_q(const _Tp* ptr)                                \
    {                                                                            \
        __m128i a = __lsx_vld(ptr, 0);                                           \
        __m128i b = intrin_lo(a, 0);                                             \
        return _Tpvec(intrin_hi(b, 0));                                          \
    }

OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_uint32x4, uchar, __lsx_vsllwil_hu_bu, __lsx_vsllwil_wu_hu)
OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_int32x4,  schar, __lsx_vsllwil_h_b,   __lsx_vsllwil_w_h)
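
// Illustrative sketch (not from the original header): the expand helpers widen
// lanes to the next size, e.g.
//     v_int16x8 v = v_load(src16);          // 8 shorts
//     v_int32x4 lo, hi;
//     v_expand(v, lo, hi);                  // vsllwil_w_h for the low half, vexth_w_h for the high
//     v_int32x4 q = v_load_expand_q(src8);  // first 4 schar values widened to int32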
 
inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(_lsx_packs_h(a.val, b.val)); }

inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, 0)); }

inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(_lsx_packus_h(a.val, b.val)); }

template<int n> inline
v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{ __lsx_vstelm_d(__lsx_vssrlrni_bu_h(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(__lsx_vssrarni_bu_h(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{ __lsx_vstelm_d(__lsx_vssrarni_bu_h(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(__lsx_vssrarni_b_h(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{ __lsx_vstelm_d(__lsx_vssrarni_b_h(a.val, a.val, n), ptr, 0, 0); }

inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, 0)); }

inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, 0)); }

inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, 0)); }

inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, 0), ptr, 0, 0); }

inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
 
template<int n> inline
v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{ __lsx_vstelm_d(__lsx_vssrarni_h_w(a.val, a.val, n), ptr, 0, 0); }

inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{ return v_uint32x4(__lsx_vpickev_w(b.val, a.val)); }

inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{ __lsx_vstelm_d(__lsx_vshuf4i_w(a.val, 0x08), ptr, 0, 0); }

inline void v_pack_store(int* ptr, const v_int64x2& a)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(a)); }

template<int n> inline
v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{ return v_uint32x4(__lsx_vsrlrni_w_d(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{ __lsx_vstelm_d(__lsx_vsrlrni_w_d(a.val, a.val, n), ptr, 0, 0); }

template<int n> inline
v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{ return v_int32x4(__lsx_vsrarni_w_d(b.val, a.val, n)); }

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{ __lsx_vstelm_d(__lsx_vsrarni_w_d(a.val, a.val, n), ptr, 0, 0); }
 
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{ return v_uint8x16(__lsx_vssrarni_b_h(b.val, a.val, 0)); }

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    __m128i ab = __lsx_vssrarni_h_w(b.val, a.val, 0);
    __m128i cd = __lsx_vssrarni_h_w(d.val, c.val, 0);
    return v_uint8x16(__lsx_vssrarni_b_h(cd, ab, 0));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    __m128i ab = __lsx_vssrarni_w_d(b.val, a.val, 0);
    __m128i cd = __lsx_vssrarni_w_d(d.val, c.val, 0);
    __m128i ef = __lsx_vssrarni_w_d(f.val, e.val, 0);
    __m128i gh = __lsx_vssrarni_w_d(h.val, g.val, 0);

    __m128i abcd = __lsx_vssrarni_h_w(cd, ab, 0);
    __m128i efgh = __lsx_vssrarni_h_w(gh, ef, 0);
    return v_uint8x16(__lsx_vssrarni_b_h(efgh, abcd, 0));
}
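
// Example (editor's sketch, not part of the original header): how the pack family above
// is typically used.  v_rshr_pack<n> narrows two 16-bit vectors into one 8-bit vector
// with a rounding right shift by n and unsigned saturation, the usual final step of a
// fixed-point filter.  scale_down_q4 is a hypothetical name.
static inline v_uint8x16 scale_down_q4(const v_uint16x8& lo, const v_uint16x8& hi)
{
    // ((x + 8) >> 4) per lane, saturated to [0, 255]
    return v_rshr_pack<4>(lo, hi);
}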
 
#define OPENCV_HAL_IMPL_LSX_EXTRACT(_Tpvec)                    \
    template<int s>                                            \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)  \
    { return v_rotate_right<s>(a, b); }

#define OPENCV_HAL_IMPL_LSX_EXTRACT_N(_Tpvec, _Twvec, intrin)             \
template<int i>                                                           \
inline _Twvec v_extract_n(const _Tpvec& a)                                \
{ return (_Twvec)intrin(a.val, i); }
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int8x16,  schar,   __lsx_vpickve2gr_b)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int16x8,  short,   __lsx_vpickve2gr_h)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint32x4, uint,    __lsx_vpickve2gr_w)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int32x4,  int,     __lsx_vpickve2gr_w)
OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int64x2,  int64,   __lsx_vpickve2gr_d)
 
template<int i>
inline float v_extract_n(const v_float32x4& v)
{
    union { uint iv; float fv; } d;
    d.iv = __lsx_vpickve2gr_w(v.val, i);
    return d.fv;
}

template<int i>
inline double v_extract_n(const v_float64x2& v)
{
    union { uint64 iv; double dv; } d;
    d.iv = __lsx_vpickve2gr_d(v.val, i);
    return d.dv;
}

template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{ return v_uint32x4(__lsx_vreplvei_w(a.val, i)); }

template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{ return v_int32x4(__lsx_vreplvei_w(a.val, i)); }

template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{ return v_float32x4((__m128)__lsx_vreplvei_w((__m128i)a.val, i)); }
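
// Example (editor's sketch, not part of the original header): v_extract_n<i> pulls a
// single lane out of a register as a scalar, while v_broadcast_element<i> splats one
// lane across the whole register.  The lane index must be a compile-time constant;
// the helper names below are hypothetical.
static inline float third_lane(const v_float32x4& v)
{
    return v_extract_n<2>(v);                 // scalar copy of lane 2
}

static inline v_float32x4 splat_first_lane(const v_float32x4& v)
{
    return v_broadcast_element<0>(v);         // {v0, v0, v0, v0}
}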
 
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vpickev_b(t1, t0);
    b.val = __lsx_vpickod_b(t1, t0);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vpickev_h(t1, t0);
    b.val = __lsx_vpickod_h(t1, t0);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vpickev_w(t1, t0);
    b.val = __lsx_vpickod_w(t1, t0);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    a.val = __lsx_vilvl_d(t1, t0);
    b.val = __lsx_vilvh_d(t1, t0);
}
 
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    const __m128i shuff0 = _v128_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    const __m128i shuff1 = _v128_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff0);
    __m128i b0 = __lsx_vbitsel_v(t1, t0, shuff1);
    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);

    const __m128i shuff_a = _v128_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29);
    const __m128i shuff_b = _v128_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30);
    const __m128i shuff_c = _v128_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31);

    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    const __m128i shuff0 = _v128_setr_h(0, 0, -1, 0, 0, -1, 0, 0);
    const __m128i shuff1 = _v128_setr_h(0, -1, 0, 0, -1, 0, 0, -1);

    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff1);
    __m128i b0 = __lsx_vbitsel_v(t0, t1, shuff0);
    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);

    const __m128i shuff_a = _v128_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 20, 21, 26, 27);
    const __m128i shuff_b = _v128_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 16, 17, 22, 23, 28, 29);
    const __m128i shuff_c = _v128_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31);

    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);

    __m128i a0 = __lsx_vpermi_w(t1, t0, 0xAC);
    __m128i b0 = __lsx_vpermi_w(t1, t0, 0xC5);
    __m128i c0 = __lsx_vpermi_w(t1, t0, 0x5A);

    a.val = __lsx_vextrins_w(a0, t2, 0x31);
    b0    = __lsx_vshuf4i_w(b0, 0x38);
    c0    = __lsx_vshuf4i_w(c0, 0x8);
    b.val = __lsx_vextrins_w(b0, t2, 0x32);
    c.val = __lsx_vpermi_w(t2, c0, 0xC4);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);

    a.val = __lsx_vshuf4i_d(t0, t1, 0xC);
    b.val = __lsx_vshuf4i_d(t0, t2, 0x9);
    c.val = __lsx_vshuf4i_d(t1, t2, 0xC);
}
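
// Example (editor's sketch, not part of the original header): typical use of the
// 3-channel deinterleave above — split 16 packed BGR pixels into planar B, G and R
// registers.  bgr must point to at least 48 readable bytes; split_bgr16 is a
// hypothetical name.
static inline void split_bgr16(const uchar* bgr, v_uint8x16& b, v_uint8x16& g, v_uint8x16& r)
{
    v_load_deinterleave(bgr, b, g, r);   // b = bgr[0,3,6,...], g = bgr[1,4,7,...], ...
}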
 
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
                                v_uint8x16& c, v_uint8x16& d)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    __m128i t3 = __lsx_vld(ptr, 48);

    __m128i ac_lo = __lsx_vpickev_b(t1, t0);
    __m128i bd_lo = __lsx_vpickod_b(t1, t0);
    __m128i ac_hi = __lsx_vpickev_b(t3, t2);
    __m128i bd_hi = __lsx_vpickod_b(t3, t2);

    a.val = __lsx_vpickev_b(ac_hi, ac_lo);
    c.val = __lsx_vpickod_b(ac_hi, ac_lo);
    b.val = __lsx_vpickev_b(bd_hi, bd_lo);
    d.val = __lsx_vpickod_b(bd_hi, bd_lo);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
                                v_uint16x8& c, v_uint16x8& d)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    __m128i t3 = __lsx_vld(ptr, 48);

    __m128i ac_lo = __lsx_vpickev_h(t1, t0);
    __m128i bd_lo = __lsx_vpickod_h(t1, t0);
    __m128i ac_hi = __lsx_vpickev_h(t3, t2);
    __m128i bd_hi = __lsx_vpickod_h(t3, t2);

    a.val = __lsx_vpickev_h(ac_hi, ac_lo);
    c.val = __lsx_vpickod_h(ac_hi, ac_lo);
    b.val = __lsx_vpickev_h(bd_hi, bd_lo);
    d.val = __lsx_vpickod_h(bd_hi, bd_lo);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b,
                                v_uint32x4& c, v_uint32x4& d)
{
    __m128i p0 = __lsx_vld(ptr, 0);
    __m128i p1 = __lsx_vld(ptr, 16);
    __m128i p2 = __lsx_vld(ptr, 32);
    __m128i p3 = __lsx_vld(ptr, 48);

    __m128i t0 = __lsx_vilvl_w(p1, p0);
    __m128i t1 = __lsx_vilvl_w(p3, p2);
    __m128i t2 = __lsx_vilvh_w(p1, p0);
    __m128i t3 = __lsx_vilvh_w(p3, p2);
    a.val = __lsx_vilvl_d(t1, t0);
    b.val = __lsx_vilvh_d(t1, t0);
    c.val = __lsx_vilvl_d(t3, t2);
    d.val = __lsx_vilvh_d(t3, t2);
}

inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b,
                                v_uint64x2& c, v_uint64x2& d)
{
    __m128i t0 = __lsx_vld(ptr, 0);
    __m128i t1 = __lsx_vld(ptr, 16);
    __m128i t2 = __lsx_vld(ptr, 32);
    __m128i t3 = __lsx_vld(ptr, 48);

    a.val = __lsx_vilvl_d(t2, t0);
    b.val = __lsx_vilvh_d(t2, t0);
    c.val = __lsx_vilvl_d(t3, t1);
    d.val = __lsx_vilvh_d(t3, t1);
}
 
inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i v0 = __lsx_vilvl_b(b.val, a.val);
    __m128i v1 = __lsx_vilvh_b(b.val, a.val);

    __lsx_vst(v0, ptr, 0);
    __lsx_vst(v1, ptr, 16);
}

inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i v0 = __lsx_vilvl_h(b.val, a.val);
    __m128i v1 = __lsx_vilvh_h(b.val, a.val);

    __lsx_vst(v0, ptr, 0);
    __lsx_vst(v1, ptr, 16);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i v0 = __lsx_vilvl_w(b.val, a.val);
    __m128i v1 = __lsx_vilvh_w(b.val, a.val);

    __lsx_vst(v0, ptr, 0);
    __lsx_vst(v1, ptr, 16);
}

inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i v0 = __lsx_vilvl_d(b.val, a.val);
    __m128i v1 = __lsx_vilvh_d(b.val, a.val);

    __lsx_vst(v0, ptr, 0);
    __lsx_vst(v1, ptr, 16);
}
 
inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                               const v_uint8x16& c,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
    __m128i v_c = c.val;
    const __m128i shuff0 = _v128_setr_b(0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10);
    const __m128i shuff1 = _v128_setr_b(11, 21, 12, 13, 22, 14, 15, 23, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 24, 18, 19, 25, 20, 21);
    const __m128i shuff3 = _v128_setr_b(26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31);
    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);

    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
    dst1 = __lsx_vshuf_b(abc, dst1, shuff2);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
}

inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                               const v_uint16x8& c,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
    __m128i v_c = c.val;
    const __m128i shuff0 = _v128_setr_b(0, 1, 2, 3, 16, 17, 4, 5, 6, 7, 18, 19, 8, 9, 10, 11);
    const __m128i shuff1 = _v128_setr_b(20, 21, 12, 13, 14, 15, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 24, 25, 20, 21);
    const __m128i shuff3 = _v128_setr_b(6, 7, 26, 27, 8, 9, 10, 11, 28, 29, 12, 13, 14, 15, 30, 31);
    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);

    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
    dst1 = __lsx_vshuf_b(abc, dst1, shuff2);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i v_c = c.val;
    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
    __m128i bc_od = __lsx_vpackod_w(v_c, b.val);

    __m128i dst0 = __lsx_vshuf4i_w(ab_lo, 0xB4);
    __m128i dst1 = __lsx_vilvl_d(ab_hi, bc_od);
    __m128i dst2 = __lsx_vpermi_w(bc_od, ab_hi, 0xE8);

    dst0 = __lsx_vextrins_w(dst0, v_c, 0x20);
    dst2 = __lsx_vextrins_w(dst2, v_c, 0x2);
    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
}

inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
    __m128i dst1 = __lsx_vpermi_w(a.val, c.val, 0xE4);
    __m128i dst2 = __lsx_vilvh_d(c.val, b.val);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
}
 
inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                               const v_uint8x16& c, const v_uint8x16& d,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
    __m128i cd_lo = __lsx_vilvl_b(d.val, c.val);
    __m128i cd_hi = __lsx_vilvh_b(d.val, c.val);

    __m128i dst0 = __lsx_vilvl_h(cd_lo, ab_lo);
    __m128i dst1 = __lsx_vilvh_h(cd_lo, ab_lo);
    __m128i dst2 = __lsx_vilvl_h(cd_hi, ab_hi);
    __m128i dst3 = __lsx_vilvh_h(cd_hi, ab_hi);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
    __lsx_vst(dst3, ptr, 48);
}

inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                               const v_uint16x8& c, const v_uint16x8& d,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
    __m128i cd_lo = __lsx_vilvl_h(d.val, c.val);
    __m128i cd_hi = __lsx_vilvh_h(d.val, c.val);

    __m128i dst0 = __lsx_vilvl_w(cd_lo, ab_lo);
    __m128i dst1 = __lsx_vilvh_w(cd_lo, ab_lo);
    __m128i dst2 = __lsx_vilvl_w(cd_hi, ab_hi);
    __m128i dst3 = __lsx_vilvh_w(cd_hi, ab_hi);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
    __lsx_vst(dst3, ptr, 48);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
    __m128i cd_lo = __lsx_vilvl_w(d.val, c.val);
    __m128i cd_hi = __lsx_vilvh_w(d.val, c.val);

    __m128i dst0 = __lsx_vilvl_d(cd_lo, ab_lo);
    __m128i dst1 = __lsx_vilvh_d(cd_lo, ab_lo);
    __m128i dst2 = __lsx_vilvl_d(cd_hi, ab_hi);
    __m128i dst3 = __lsx_vilvh_d(cd_hi, ab_hi);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
    __lsx_vst(dst3, ptr, 48);
}

inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c, const v_uint64x2& d,
                               hal::StoreMode = hal::STORE_UNALIGNED)
{
    __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
    __m128i dst2 = __lsx_vilvh_d(b.val, a.val);
    __m128i dst1 = __lsx_vilvl_d(d.val, c.val);
    __m128i dst3 = __lsx_vilvh_d(d.val, c.val);

    __lsx_vst(dst0, ptr, 0);
    __lsx_vst(dst1, ptr, 16);
    __lsx_vst(dst2, ptr, 32);
    __lsx_vst(dst3, ptr, 48);
}
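
// Example (editor's sketch, not part of the original header): typical use of the
// interleaving stores above — merge planar B, G, R and A registers back into packed
// BGRA pixels.  bgra must have room for 64 bytes; merge_bgra16 is a hypothetical name.
static inline void merge_bgra16(uchar* bgra, const v_uint8x16& b, const v_uint8x16& g,
                                const v_uint8x16& r, const v_uint8x16& a)
{
    v_store_interleave(bgra, b, g, r, a);   // bgra[0..3] = {b0, g0, r0, a0}, ...
}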
 
#define OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1)  \
inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0)                        \
{                                                                                                 \
    _Tpvec1 a1, b1;                                                                               \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1);                                                \
    a0 = v_reinterpret_as_##suffix0(a1);                                                          \
    b0 = v_reinterpret_as_##suffix0(b1);                                                          \
}                                                                                                 \
inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0)           \
{                                                                                                 \
    _Tpvec1 a1, b1, c1;                                                                           \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1);                                            \
    a0 = v_reinterpret_as_##suffix0(a1);                                                          \
    b0 = v_reinterpret_as_##suffix0(b1);                                                          \
    c0 = v_reinterpret_as_##suffix0(c1);                                                          \
}                                                                                                 \
inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0,                        \
                                _Tpvec0& c0, _Tpvec0& d0)                                         \
{                                                                                                 \
    _Tpvec1 a1, b1, c1, d1;                                                                       \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1);                                        \
    a0 = v_reinterpret_as_##suffix0(a1);                                                          \
    b0 = v_reinterpret_as_##suffix0(b1);                                                          \
    c0 = v_reinterpret_as_##suffix0(c1);                                                          \
    d0 = v_reinterpret_as_##suffix0(d1);                                                          \
}                                                                                                 \
inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,                   \
                               hal::StoreMode = hal::STORE_UNALIGNED)                             \
{                                                                                                 \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                  \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                  \
    v_store_interleave((_Tp1*)ptr, a1, b1);                                                       \
}                                                                                                 \
inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0,\
                               hal::StoreMode = hal::STORE_UNALIGNED)                             \
{                                                                                                 \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                  \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                  \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                                  \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1);                                                   \
}                                                                                                 \
inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,                   \
                               const _Tpvec0& c0, const _Tpvec0& d0,                              \
                               hal::StoreMode = hal::STORE_UNALIGNED)                             \
{                                                                                                 \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                  \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                  \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                                  \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0);                                                  \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1);                                               \
}
inline v_float32x4 v_load_expand(const hfloat* ptr)
{
#if CV_FP16
    return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0)));
#else
    float CV_DECL_ALIGNED(32) buf[4];
    for (int i = 0; i < 4; i++)
        buf[i] = (float)ptr[i];
    return v_float32x4((__m128)__lsx_vld(buf, 0));
#endif
}

inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
{
#if CV_FP16
    __m128i res = (__m128i)__lsx_vfcvt_h_s(a.val, a.val);
    __lsx_vstelm_d(res, ptr, 0, 0);
#else
    float CV_DECL_ALIGNED(32) buf[4];
    v_store_aligned(buf, a);
    for (int i = 0; i < 4; i++)
        ptr[i] = hfloat(buf[i]);
#endif
}
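
// Example (editor's sketch, not part of the original header): a minimal use of the FP16
// conversion above.  v_load_expand widens four half-precision values to float32; the
// widened lanes are then stored as plain floats.  f16_to_f32 is a hypothetical helper.
static inline void f16_to_f32(const hfloat* src, float* dst)
{
    v_float32x4 v = v_load_expand(src);   // 4 x f16 -> 4 x f32
    v_store(dst, v);
}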
 
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 