5#ifndef OPENCV_HAL_INTRIN_MSA_HPP 
    6#define OPENCV_HAL_INTRIN_MSA_HPP 
    9#include "opencv2/core/utility.hpp" 
   15CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 
   21#define CV_SIMD128_64F 1 
   25    typedef uchar lane_type;
 
   33        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
 
   39        return msa_getq_lane_u8(val, 0);
 
   47    typedef schar lane_type;
 
   55        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
 
   61        return msa_getq_lane_s8(val, 0);
 
   76        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
 
   77        val = msa_ld1q_u16(v);
 
   82        return msa_getq_lane_u16(val, 0);
 
   90    typedef short lane_type;
 
   95    v_int16x8(
short v0, 
short v1, 
short v2, 
short v3, 
short v4, 
short v5, 
short v6, 
short v7)
 
   97        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
 
   98        val = msa_ld1q_s16(v);
 
  103        return msa_getq_lane_s16(val, 0);
 
  111    typedef unsigned int lane_type;
 
  116    v_uint32x4(
unsigned int v0, 
unsigned int v1, 
unsigned int v2, 
unsigned int v3)
 
  118        unsigned int v[] = {v0, v1, v2, v3};
 
  119        val = msa_ld1q_u32(v);
 
  122    unsigned int get0()
 const 
  124        return msa_getq_lane_u32(val, 0);
 
  132    typedef int lane_type;
 
  137    v_int32x4(
int v0, 
int v1, 
int v2, 
int v3)
 
  139        int v[] = {v0, v1, v2, v3};
 
  140        val = msa_ld1q_s32(v);
 
  145        return msa_getq_lane_s32(val, 0);
 
  153    typedef float lane_type;
 
  158    v_float32x4(
float v0, 
float v1, 
float v2, 
float v3)
 
  160        float v[] = {v0, v1, v2, v3};
 
  161        val = msa_ld1q_f32(v);
 
  166        return msa_getq_lane_f32(val, 0);
 
  182        val = msa_ld1q_u64(v);
 
  187        return msa_getq_lane_u64(val, 0);
 
  195    typedef int64 lane_type;
 
  202        int64 v[] = {v0, v1};
 
  203        val = msa_ld1q_s64(v);
 
  208        return msa_getq_lane_s64(val, 0);
 
  216    typedef double lane_type;
 
  223        double v[] = {v0, v1};
 
  224        val = msa_ld1q_f64(v);
 
  229        return msa_getq_lane_f64(val, 0);
 
// Generates, for one vector type v_<_Tpv>:
//  - v_setzero_<suffix>() : all lanes set to 0 (broadcast via msa_dupq_n)
//  - v_setall_<suffix>(v) : all lanes set to v
//  - v_reinterpret_as_*() : zero-cost bit-level reinterpretation of the 128-bit
//    register to every other supported lane type (u8..f64).
// NOTE(review): the leading digits fused into each line (e.g. "235") are an
// extraction artifact of this copy; code tokens are left byte-identical.
  235#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \ 
  236inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \ 
  237inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \ 
  238inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \ 
  239inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \ 
  240inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \ 
  241inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \ 
  242inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \ 
  243inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \ 
  244inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \ 
  245inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \ 
  246inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \ 
  247inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); } 
// Instantiate setzero/setall/reinterpret helpers for all ten 128-bit types.
  249OPENCV_HAL_IMPL_MSA_INIT(uint8x16, 
uchar, u8)

  250OPENCV_HAL_IMPL_MSA_INIT(int8x16, 
schar, s8)

  251OPENCV_HAL_IMPL_MSA_INIT(uint16x8, 
ushort, u16)

  252OPENCV_HAL_IMPL_MSA_INIT(int16x8, 
short, s16)

  253OPENCV_HAL_IMPL_MSA_INIT(uint32x4, 
unsigned int, u32)

  254OPENCV_HAL_IMPL_MSA_INIT(int32x4, 
int, s32)

  255OPENCV_HAL_IMPL_MSA_INIT(uint64x2, 
uint64, u64)

  256OPENCV_HAL_IMPL_MSA_INIT(int64x2, 
int64, s64)

  257OPENCV_HAL_IMPL_MSA_INIT(float32x4, 
float, f32)

  258OPENCV_HAL_IMPL_MSA_INIT(float64x2, 
double, f64)
 
  260#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \ 
  261inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \ 
  263    return _Tpvec(mov(a.val, b.val)); \ 
  265template<int n> inline \ 
  266_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \ 
  268    return _Tpvec(rshr(a.val, b.val, n)); \ 
  280#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \ 
  281inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ 
  283    hreg a1 = mov(a.val); \ 
  284    msa_st1_##suffix(ptr, a1); \ 
  286template<int n> inline \ 
  287void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ 
  289    hreg a1 = rshr(a.val, n); \ 
  290    msa_st1_##suffix(ptr, a1); \ 
  296OPENCV_HAL_IMPL_MSA_PACK_STORE(
v_int16x8, 
short, v4i16, s16, 
v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32)
 
  297OPENCV_HAL_IMPL_MSA_PACK_STORE(
v_uint32x4, 
unsigned, v2u32, u32, 
v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64)
 
  298OPENCV_HAL_IMPL_MSA_PACK_STORE(
v_int32x4, 
int, v2i32, s32, 
v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64)
 
  305    return v_uint8x16(msa_pack_u16(a.val, b.val));
 
  311    return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val)));
 
  318    v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val));
 
  319    v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val));
 
  328    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
 
  329    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
 
  330    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
 
  331    res = msa_mlaq_lane_f32(res, m3.val, v0, 3);
 
  340    v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0);
 
  341    res = msa_mlaq_lane_f32(res, m1.val, v0, 1);
 
  342    res = msa_mlaq_lane_f32(res, m2.val, v0, 2);
 
  343    res = msa_addq_f32(res, a.val);
 
  347#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \ 
  348inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ 
  350    return _Tpvec(intrin(a.val, b.val)); \ 
  352inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ 
  354    a.val = intrin(a.val, b.val); \ 
// Arithmetic operator instantiations. Note the convention:
//  - 8/16-bit lanes use the saturating intrinsics (msa_qaddq/msa_qsubq),
//  - 32/64-bit integer lanes use plain modular add/sub/mul,
//  - float/double use IEEE add/sub/mul/div; operator/ exists only for floats.
  358OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_uint8x16, msa_qaddq_u8)

  359OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_uint8x16, msa_qsubq_u8)

  360OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_int8x16, msa_qaddq_s8)

  361OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_int8x16, msa_qsubq_s8)

  362OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_uint16x8, msa_qaddq_u16)

  363OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_uint16x8, msa_qsubq_u16)

  364OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_int16x8, msa_qaddq_s16)

  365OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_int16x8, msa_qsubq_s16)

  366OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_int32x4, msa_addq_s32)

  367OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_int32x4, msa_subq_s32)

  368OPENCV_HAL_IMPL_MSA_BIN_OP(*, 
v_int32x4, msa_mulq_s32)

  369OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_uint32x4, msa_addq_u32)

  370OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_uint32x4, msa_subq_u32)

  371OPENCV_HAL_IMPL_MSA_BIN_OP(*, 
v_uint32x4, msa_mulq_u32)

  372OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_float32x4, msa_addq_f32)

  373OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_float32x4, msa_subq_f32)

  374OPENCV_HAL_IMPL_MSA_BIN_OP(*, 
v_float32x4, msa_mulq_f32)

  375OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_int64x2, msa_addq_s64)

  376OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_int64x2, msa_subq_s64)

  377OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_uint64x2, msa_addq_u64)

  378OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_uint64x2, msa_subq_u64)

  379OPENCV_HAL_IMPL_MSA_BIN_OP(/, 
v_float32x4, msa_divq_f32)

  380OPENCV_HAL_IMPL_MSA_BIN_OP(+, 
v_float64x2, msa_addq_f64)

  381OPENCV_HAL_IMPL_MSA_BIN_OP(-, 
v_float64x2, msa_subq_f64)

  382OPENCV_HAL_IMPL_MSA_BIN_OP(*, 
v_float64x2, msa_mulq_f64)

  383OPENCV_HAL_IMPL_MSA_BIN_OP(/, 
v_float64x2, msa_divq_f64)
 
  386#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec)         \ 
  387inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \ 
  390    v_mul_expand(a, b, c, d);                                \ 
  391    return v_pack(c, d);                                     \ 
  393inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \ 
  394{a = a * b; return a; } 
  405    v16i8 a_lo, a_hi, b_lo, b_hi;
 
  407    ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi);
 
  408    ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi);
 
  409    c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo));
 
  410    d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi));
 
  416    v16u8 a_lo, a_hi, b_lo, b_hi;
 
  418    ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi);
 
  419    ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi);
 
  420    c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo));
 
  421    d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi));
 
  427    v8i16 a_lo, a_hi, b_lo, b_hi;
 
  429    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
 
  430    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
 
  431    c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo));
 
  432    d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi));
 
  438    v8u16 a_lo, a_hi, b_lo, b_hi;
 
  440    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
 
  441    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
 
  442    c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo));
 
  443    d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi));
 
  449    v4u32 a_lo, a_hi, b_lo, b_hi;
 
  451    ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi);
 
  452    ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi);
 
  453    c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo));
 
  454    d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi));
 
  459    v8i16 a_lo, a_hi, b_lo, b_hi;
 
  461    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi);
 
  462    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi);
 
  464    return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)),
 
  465                                   msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16));
 
  470    v8u16 a_lo, a_hi, b_lo, b_hi;
 
  472    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi);
 
  473    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi);
 
  475    return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)),
 
  476                                    msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16));
 
  483{ 
return v_int32x4(msa_dotp_s_w(a.val, b.val)); }
 
  485{ 
return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); }
 
  489{ 
return v_int64x2(msa_dotp_s_d(a.val, b.val)); }
 
  491{ 
return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); }
 
  496    v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
 
  497    v8u16 odd_a  = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
 
  498    v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
 
  499    v8u16 odd_b  = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
 
  500    v4u32 prod   = msa_dotp_u_w(even_a, even_b);
 
  501    return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
 
  505    v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);
 
  506    v8u16 odd_a  = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);
 
  507    v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
 
  508    v8u16 odd_b  = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
 
  509    v4u32 prod   = msa_dpadd_u_w(c.val, even_a, even_b);
 
  510    return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b));
 
  515    v8i16 prod = msa_dotp_s_h(a.val, b.val);
 
  516    return v_int32x4(msa_hadd_s32(prod, prod));
 
  525    v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
 
  526    v4u32 odd_a  = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
 
  527    v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
 
  528    v4u32 odd_b  = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
 
  529    v2u64 prod   = msa_dotp_u_d(even_a, even_b);
 
  530    return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
 
  535    v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);
 
  536    v4u32 odd_a  = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);
 
  537    v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
 
  538    v4u32 odd_b  = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
 
  539    v2u64 prod   = msa_dpadd_u_d(c.val, even_a, even_b);
 
  540    return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b));
 
  545    v4i32 prod = msa_dotp_s_w(a.val, b.val);
 
  546    return v_int64x2(msa_hadd_s64(prod, prod));
 
  598#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \ 
  599OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix)   \ 
  600OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix)   \ 
  601OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix)   \ 
  602inline _Tpvec operator ~ (const _Tpvec& a) \ 
  604    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \ 
// Bitwise &, |, ^ (and ~, via msa_mvnq_u8 on the byte view) for all integer
// vector types; float variants are provided separately through bit
// reinterpretation (see OPENCV_HAL_IMPL_MSA_FLT_BIT_OP below).
  607OPENCV_HAL_IMPL_MSA_LOGIC_OP(
v_uint8x16, v16u8, u8)

  608OPENCV_HAL_IMPL_MSA_LOGIC_OP(
v_int8x16, v16i8, s8)

  609OPENCV_HAL_IMPL_MSA_LOGIC_OP(
v_uint16x8, v8u16, u16)

  610OPENCV_HAL_IMPL_MSA_LOGIC_OP(
v_int16x8, v8i16, s16)

  611OPENCV_HAL_IMPL_MSA_LOGIC_OP(
v_uint32x4, v4u32, u32)

  612OPENCV_HAL_IMPL_MSA_LOGIC_OP(
v_int32x4, v4i32, s32)

  613OPENCV_HAL_IMPL_MSA_LOGIC_OP(
v_uint64x2, v2u64, u64)

  614OPENCV_HAL_IMPL_MSA_LOGIC_OP(
v_int64x2, v2i64, s64)
 
  616#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \ 
  617inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ 
  619    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \ 
  621inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ 
  623    a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \ 
  627OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32)
 
  628OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32)
 
  629OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32)
 
  633    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
 
  637#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \ 
  638inline _Tpuvec v_abs(const _Tpsvec& a) \ 
  640    return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \ 
  648#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \ 
  649inline _Tpvec func(const _Tpvec& a) \ 
  651    return _Tpvec(intrin(a.val)); \ 
  654OPENCV_HAL_IMPL_MSA_BASIC_FUNC(
v_float32x4, v_abs, msa_absq_f32)
 
  655OPENCV_HAL_IMPL_MSA_BASIC_FUNC(
v_float64x2, v_abs, msa_absq_f64)
 
  656OPENCV_HAL_IMPL_MSA_BASIC_FUNC(
v_float32x4, v_sqrt, msa_sqrtq_f32)
 
  658OPENCV_HAL_IMPL_MSA_BASIC_FUNC(
v_float64x2, v_sqrt, msa_sqrtq_f64)
 
  661#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \ 
  662inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ 
  664    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \ 
  666inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ 
  668    a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \ 
  672OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64)
 
  673OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64)
 
  674OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64)
 
  678    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
 
  683#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \ 
  684inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ 
  686    return _Tpvec(intrin(a.val, b.val)); \ 
// Element-wise v_min / v_max for every lane type, each mapped directly to the
// corresponding native MSA min/max intrinsic.
  689OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint8x16, v_min, msa_minq_u8)

  690OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint8x16, v_max, msa_maxq_u8)

  691OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int8x16, v_min, msa_minq_s8)

  692OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int8x16, v_max, msa_maxq_s8)

  693OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint16x8, v_min, msa_minq_u16)

  694OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint16x8, v_max, msa_maxq_u16)

  695OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int16x8, v_min, msa_minq_s16)

  696OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int16x8, v_max, msa_maxq_s16)

  697OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint32x4, v_min, msa_minq_u32)

  698OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint32x4, v_max, msa_maxq_u32)

  699OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int32x4, v_min, msa_minq_s32)

  700OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int32x4, v_max, msa_maxq_s32)

  701OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_float32x4, v_min, msa_minq_f32)

  702OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_float32x4, v_max, msa_maxq_f32)

  703OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_float64x2, v_min, msa_minq_f64)

  704OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_float64x2, v_max, msa_maxq_f64)
 
// Generates the six comparison operators (==, !=, <, >, <=, >=) for _Tpvec.
// Each MSA compare produces an all-ones/all-zeros mask which is reinterpreted
// back to the lane type _Tpv. operator!= is built as bitwise NOT of the
// equality mask; 'not_suffix' selects the unsigned width used by msa_mvnq_*.
  706#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \ 
  707inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ 
  708{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \ 
  709inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ 
  710{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \ 
  711inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ 
  712{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \ 
  713inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ 
  714{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \ 
  715inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ 
  716{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \ 
  717inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ 
  718{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); } 
// Instantiate comparisons for all ten types; the final argument is the
// unsigned lane width used when negating the equality mask for operator!=.
  720OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_uint8x16, v16u8, u8, u8)

  721OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_int8x16, v16i8, s8, u8)

  722OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_uint16x8, v8u16, u16, u16)

  723OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_int16x8, v8i16, s16, u16)

  724OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_uint32x4, v4u32, u32, u32)

  725OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_int32x4, v4i32, s32, u32)

  726OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_float32x4, v4f32, f32, u32)

  727OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_uint64x2, v2u64, u64, u64)

  728OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_int64x2, v2i64, s64, u64)

  729OPENCV_HAL_IMPL_MSA_INT_CMP_OP(
v_float64x2, v2f64, f64, u64)
 
  732{ 
return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); }
 
  734{ 
return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); }
 
// Wrapping (modular) add/sub/mul for 8/16-bit lanes: these use the plain
// msa_addq/msa_subq/msa_mulq intrinsics, i.e. the non-saturating counterparts
// of the default operator+/- defined above for these narrow types.
  736OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint8x16, v_add_wrap, msa_addq_u8)

  737OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int8x16, v_add_wrap, msa_addq_s8)

  738OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint16x8, v_add_wrap, msa_addq_u16)

  739OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int16x8, v_add_wrap, msa_addq_s16)

  740OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint8x16, v_sub_wrap, msa_subq_u8)

  741OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int8x16, v_sub_wrap, msa_subq_s8)

  742OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint16x8, v_sub_wrap, msa_subq_u16)

  743OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int16x8, v_sub_wrap, msa_subq_s16)

  744OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint8x16, v_mul_wrap, msa_mulq_u8)

  745OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int8x16, v_mul_wrap, msa_mulq_s8)

  746OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_uint16x8, v_mul_wrap, msa_mulq_u16)

  747OPENCV_HAL_IMPL_MSA_BIN_FUNC(
v_int16x8, v_mul_wrap, msa_mulq_s16)
 
  759#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \ 
  760inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ 
  762    return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \ 
  772    v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
 
  778    return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));
 
  783    return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));
 
  788    return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));
 
  793    return v_fma(a, b, c);
 
  798    return v_fma(a, b, c);
 
  803    v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
 
  809    return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
 
  814    return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val));
 
  819    return v_fma(a, b, c);
 
// Generates shift operators for _Tpvec:
//  - operator<< / operator>> take a runtime count n, broadcast to every lane
//    via msa_dupq_n_<ssuffix> (cast to the signed lane type _Tps);
//  - v_shl<n> / v_shr<n> take a compile-time immediate;
//  - v_rshr<n> uses the rounding shift-right intrinsic (msa_rshrq_n_*).
  823#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ 
  824inline _Tpvec operator << (const _Tpvec& a, int n) \ 
  825{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ 
  826inline _Tpvec operator >> (const _Tpvec& a, int n) \ 
  827{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ 
  828template<int n> inline _Tpvec v_shl(const _Tpvec& a) \ 
  829{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \ 
  830template<int n> inline _Tpvec v_shr(const _Tpvec& a) \ 
  831{ return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \ 
  832template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \ 
  833{ return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); } 
// Shift operators for the 16- and 32-bit integer types (signed shift-count
// lane type in the last two arguments).
  837OPENCV_HAL_IMPL_MSA_SHIFT_OP(
v_uint16x8, u16, 
short, s16)

  838OPENCV_HAL_IMPL_MSA_SHIFT_OP(
v_int16x8, s16, 
short, s16)

  839OPENCV_HAL_IMPL_MSA_SHIFT_OP(
v_uint32x4, u32, 
int, s32)

  840OPENCV_HAL_IMPL_MSA_SHIFT_OP(
v_int32x4, s32, 
int, s32)
 
  845#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \ 
  846template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \ 
  848    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \ 
  850template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \ 
  852    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \ 
  854template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ 
  858template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ 
  860    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \ 
  862template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ 
  864    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \ 
  866template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ 
// Instantiate lane-rotation helpers for all ten types. Element moves are
// performed on the signed-integer view of the matching lane width (third
// argument), then reinterpreted back to the real lane type (second argument).
  872OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_uint8x16, v16u8, v16i8, s8)

  873OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_int8x16, v16i8, v16i8, s8)

  874OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_uint16x8, v8u16, v8i16, s16)

  875OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_int16x8, v8i16, v8i16, s16)

  876OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_uint32x4, v4u32, v4i32, s32)

  877OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_int32x4, v4i32, v4i32, s32)

  878OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_float32x4, v4f32, v4i32, s32)

  879OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_uint64x2, v2u64, v2i64, s64)

  880OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_int64x2, v2i64, v2i64, s64)

  881OPENCV_HAL_IMPL_MSA_ROTATE_OP(
v_float64x2, v2f64, v2i64, s64)
 
  883#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ 
  884inline _Tpvec v_load(const _Tp* ptr) \ 
  885{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \ 
  886inline _Tpvec v_load_aligned(const _Tp* ptr) \ 
  887{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \ 
  888inline _Tpvec v_load_low(const _Tp* ptr) \ 
  889{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \ 
  890inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ 
  891{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \ 
  892inline void v_store(_Tp* ptr, const _Tpvec& a) \ 
  893{ msa_st1q_##suffix(ptr, a.val); } \ 
  894inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ 
  895{ msa_st1q_##suffix(ptr, a.val); } \ 
  896inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ 
  897{ msa_st1q_##suffix(ptr, a.val); } \ 
  898inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode ) \ 
  899{ msa_st1q_##suffix(ptr, a.val); } \ 
  900inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ 
  902    int n  = _Tpvec::nlanes; \ 
  903    for( int i = 0; i < (n/2); i++ ) \ 
  906inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ 
  908    int n  = _Tpvec::nlanes; \ 
  909    for( int i = 0; i < (n/2); i++ ) \ 
  910        ptr[i] = a.val[i+(n/2)]; \ 
  916OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(
v_int16x8, 
short, s16)
 
  917OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(
v_uint32x4, 
unsigned, u32)
 
  918OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(
v_int32x4, 
int, s32)
 
  921OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(
v_float32x4, 
float, f32)
 
  922OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(
v_float64x2, 
double, f64)
 
  928    v_uint8x16 c = 
v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val));
 
  933{ 
return v_reinterpret_as_s8(
v_reverse(v_reinterpret_as_u8(a))); }
 
  937    v_uint16x8 c = 
v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val));
 
  942{ 
return v_reinterpret_as_s16(
v_reverse(v_reinterpret_as_u16(a))); }
 
  955{ 
return v_reinterpret_as_s32(
v_reverse(v_reinterpret_as_u32(a))); }
 
  958{ 
return v_reinterpret_as_f32(
v_reverse(v_reinterpret_as_u32(a))); }
 
  969{ 
return v_reinterpret_as_s64(
v_reverse(v_reinterpret_as_u64(a))); }
 
  972{ 
return v_reinterpret_as_f64(
v_reverse(v_reinterpret_as_u64(a))); }
 
  975#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \ 
  976inline unsigned short v_reduce_##func(const v_uint16x8& a) \ 
  979    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \ 
  980    v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \ 
  982    ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \ 
  983    v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \ 
  984    return (unsigned short)cfunc(c[0], c[1]); \ 
  987OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, 
std::max)
 
  988OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, 
std::min)
 
  990#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \ 
  991inline short v_reduce_##func(const v_int16x8& a) \ 
  994    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \ 
  995    v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \ 
  997    ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \ 
  998    v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \ 
  999    return (short)cfunc(c[0], c[1]); \ 
 1002OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, 
std::max)
 
 1003OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, 
std::min)
 
 1005#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \ 
 1006inline scalartype v_reduce_##func(const _Tpvec& a) \ 
 1008    return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \ 
 1019#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) \ 
 1020inline scalartype v_reduce_##func(const _Tpvec& a) \ 
 1023    v_expand(a, a1, a2); \ 
 1024    return (scalartype)v_reduce_##func(v_##func(a1, a2)); \ 
 1034#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \ 
 1035inline scalartype v_reduce_sum(const _Tpvec& a) \ 
 1037    return (scalartype)msa_sum_##suffix(a.val); \ 
 1040OPENCV_HAL_IMPL_MSA_REDUCE_SUM(
v_uint8x16, 
unsigned short, u8)
 
 1041OPENCV_HAL_IMPL_MSA_REDUCE_SUM(
v_int8x16, 
short, s8)
 
 1042OPENCV_HAL_IMPL_MSA_REDUCE_SUM(
v_uint16x8, 
unsigned, u16)
 
 1043OPENCV_HAL_IMPL_MSA_REDUCE_SUM(
v_int16x8, 
int, s16)
 
 1044OPENCV_HAL_IMPL_MSA_REDUCE_SUM(
v_uint32x4, uint64_t, u32)
 
 1045OPENCV_HAL_IMPL_MSA_REDUCE_SUM(
v_int32x4, int64_t, s32)
 
 1046OPENCV_HAL_IMPL_MSA_REDUCE_SUM(
v_float32x4, 
float, f32)
 
 1049{ 
return (
uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); }
 
 1051{ 
return (
int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); }
 
 1054    return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1);
 
 1061    v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))),
 
 1062                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val)))); 
 
 1063    v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))),
 
 1064                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val)))); 
 
 1066    return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))),
 
 1067                                    MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0)))));
 
 1072    v16u8 t0 = msa_abdq_u8(a.val, b.val);
 
 1073    v8u16 t1 = msa_paddlq_u8(t0);
 
 1074    v4u32 t2 = msa_paddlq_u16(t1);
 
 1075    return msa_sum_u32(t2);
 
 1079    v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val));
 
 1080    v8u16 t1 = msa_paddlq_u8(t0);
 
 1081    v4u32 t2 = msa_paddlq_u16(t1);
 
 1082    return msa_sum_u32(t2);
 
 1086    v8u16 t0 = msa_abdq_u16(a.val, b.val);
 
 1087    v4u32 t1 = msa_paddlq_u16(t0);
 
 1088    return msa_sum_u32(t1);
 
 1092    v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val));
 
 1093    v4u32 t1 = msa_paddlq_u16(t0);
 
 1094    return msa_sum_u32(t1);
 
 1098    v4u32 t0 = msa_abdq_u32(a.val, b.val);
 
 1099    return msa_sum_u32(t0);
 
 1103    v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val));
 
 1104    return msa_sum_u32(t0);
 
 1108    v4f32 t0 = msa_abdq_f32(a.val, b.val);
 
 1109    return msa_sum_f32(t0);
 
 1113#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \ 
 1114inline v_uint8x16 v_popcount(const _Tpvec& a) \ 
 1116    v16u8 t = MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \ 
 1117    return v_uint8x16(t); \ 
 1119OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(
v_uint8x16)
 
 1120OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(
v_int8x16)
 
 1122#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \ 
 1123inline v_uint16x8 v_popcount(const _Tpvec& a) \ 
 1125    v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \ 
 1126    return v_uint16x8(t); \ 
 1128OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(
v_uint16x8)
 
 1129OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(
v_int16x8)
 
 1131#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \ 
 1132inline v_uint32x4 v_popcount(const _Tpvec& a) \ 
 1134    v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \ 
 1135    return v_uint32x4(t); \ 
 1137OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(
v_uint32x4)
 
 1138OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(
v_int32x4)
 
 1140#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \ 
 1141inline v_uint64x2 v_popcount(const _Tpvec& a) \ 
 1143    v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \ 
 1144    return v_uint64x2(t); \ 
 1146OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(
v_uint64x2)
 
 1147OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(
v_int64x2)
 
 1151    v8i8 m0 = msa_create_s8(
CV_BIG_UINT(0x0706050403020100));
 
 1152    v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0));
 
 1153    v8u16 v1 = msa_paddlq_u8(v0);
 
 1154    v4u32 v2 = msa_paddlq_u16(v1);
 
 1155    v2u64 v3 = msa_paddlq_u32(v2);
 
 1156    return (
int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8);
 
 1159{ 
return v_signmask(v_reinterpret_as_u8(a)); }
 
 1163    v4i16 m0 = msa_create_s16(
CV_BIG_UINT(0x0003000200010000));
 
 1164    v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0));
 
 1165    v4u32 v1 = msa_paddlq_u16(v0);
 
 1166    v2u64 v2 = msa_paddlq_u32(v1);
 
 1167    return (
int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4);
 
 1170{ 
return v_signmask(v_reinterpret_as_u16(a)); }
 
 1174    v2i32 m0 = msa_create_s32(
CV_BIG_UINT(0x0000000100000000));
 
 1175    v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0));
 
 1176    v2u64 v1 = msa_paddlq_u32(v0);
 
 1177    return (
int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2);
 
 1180{ 
return v_signmask(v_reinterpret_as_u32(a)); }
 
 1182{ 
return v_signmask(v_reinterpret_as_u32(a)); }
 
 1186    v2u64 v0 = msa_shrq_n_u64(a.val, 63);
 
 1187    return (
int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1);
 
 1190{ 
return v_signmask(v_reinterpret_as_u64(a)); }
 
 1192{ 
return v_signmask(v_reinterpret_as_u64(a)); }
 
 1205#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \ 
 1206inline bool v_check_all(const v_##_Tpvec& a) \ 
 1208    _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \ 
 1209    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \ 
 1210    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \ 
 1212inline bool v_check_any(const v_##_Tpvec& a) \ 
 1214    _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \ 
 1215    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \ 
 1216    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \ 
 1219OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7)
 
 1220OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15)
 
 1221OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31)
 
 1222OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63)
 
 1252#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \ 
 1253inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ 
 1255    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \ 
 1256                  MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \ 
 1259OPENCV_HAL_IMPL_MSA_SELECT(
v_uint8x16, v16u8, v16u8)
 
 1260OPENCV_HAL_IMPL_MSA_SELECT(
v_int8x16, v16i8, v16u8)
 
 1261OPENCV_HAL_IMPL_MSA_SELECT(
v_uint16x8, v8u16, v16u8)
 
 1262OPENCV_HAL_IMPL_MSA_SELECT(
v_int16x8, v8i16, v16u8)
 
 1263OPENCV_HAL_IMPL_MSA_SELECT(
v_uint32x4, v4u32, v16u8)
 
 1264OPENCV_HAL_IMPL_MSA_SELECT(
v_int32x4, v4i32, v16u8)
 
 1265OPENCV_HAL_IMPL_MSA_SELECT(
v_float32x4, v4f32, v16u8)
 
 1266OPENCV_HAL_IMPL_MSA_SELECT(
v_float64x2, v2f64, v16u8)
 
 1268#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \ 
 1269inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ 
 1271    _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ 
 1272    _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ 
 1273    b0.val = msa_paddlq_##suffix(a_lo); \ 
 1274    b1.val = msa_paddlq_##suffix(a_hi); \ 
 1276inline _Tpwvec v_expand_low(const _Tpvec& a) \ 
 1278    _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ 
 1279    return _Tpwvec(msa_paddlq_##suffix(a_lo)); \ 
 1281inline _Tpwvec v_expand_high(const _Tpvec& a) \ 
 1283    _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ 
 1284    return _Tpwvec(msa_paddlq_##suffix(a_hi)); \ 
 1286inline _Tpwvec v_load_expand(const _Tp* ptr) \ 
 1288    return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \ 
 1300    return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]});
 
 1305    return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]});
 
 1309#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \ 
 1310inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ 
 1312    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ 
 1313    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ 
 1315inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ 
 1317    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \ 
 1319inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ 
 1321    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \ 
 1323inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ 
 1325    c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \ 
 1326    d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \ 
 1329OPENCV_HAL_IMPL_MSA_UNPACKS(
v_uint8x16, v16u8, v16i8, s8)
 
 1330OPENCV_HAL_IMPL_MSA_UNPACKS(
v_int8x16, v16i8, v16i8, s8)
 
 1331OPENCV_HAL_IMPL_MSA_UNPACKS(
v_uint16x8, v8u16, v8i16, s16)
 
 1332OPENCV_HAL_IMPL_MSA_UNPACKS(
v_int16x8, v8i16, v8i16, s16)
 
 1333OPENCV_HAL_IMPL_MSA_UNPACKS(
v_uint32x4, v4u32, v4i32, s32)
 
 1334OPENCV_HAL_IMPL_MSA_UNPACKS(
v_int32x4, v4i32, v4i32, s32)
 
 1335OPENCV_HAL_IMPL_MSA_UNPACKS(
v_float32x4, v4f32, v4i32, s32)
 
 1336OPENCV_HAL_IMPL_MSA_UNPACKS(
v_float64x2, v2f64, v2i64, s64)
 
 1339#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \ 
 1341inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ 
 1343    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \ 
 1346OPENCV_HAL_IMPL_MSA_EXTRACT(
v_uint8x16, v16u8, v16i8, s8)
 
 1347OPENCV_HAL_IMPL_MSA_EXTRACT(
v_int8x16, v16i8, v16i8, s8)
 
 1348OPENCV_HAL_IMPL_MSA_EXTRACT(
v_uint16x8, v8u16, v8i16, s16)
 
 1349OPENCV_HAL_IMPL_MSA_EXTRACT(
v_int16x8, v8i16, v8i16, s16)
 
 1350OPENCV_HAL_IMPL_MSA_EXTRACT(
v_uint32x4, v4u32, v4i32, s32)
 
 1351OPENCV_HAL_IMPL_MSA_EXTRACT(
v_int32x4, v4i32, v4i32, s32)
 
 1352OPENCV_HAL_IMPL_MSA_EXTRACT(
v_uint64x2, v2u64, v2i64, s64)
 
 1353OPENCV_HAL_IMPL_MSA_EXTRACT(
v_int64x2, v2i64, v2i64, s64)
 
 1354OPENCV_HAL_IMPL_MSA_EXTRACT(
v_float32x4, v4f32, v4i32, s32)
 
 1355OPENCV_HAL_IMPL_MSA_EXTRACT(
v_float64x2, v2f64, v2i64, s64)
 
 1360    return v_int32x4(msa_cvttintq_s32_f32(a.val));
 
 1365    v4i32 a1 = msa_cvttintq_s32_f32(a.val);
 
 1366    return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val))));
 
 1371    v4i32 a1 = msa_cvttintq_s32_f32(a.val);
 
 1372    return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1)))));
 
 1377    return v_int32x4(msa_cvttruncq_s32_f32(a.val));
 
 1382    return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0)));
 
 1387    return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val)));
 
 1392    v2f64 a1 = msa_cvtrintq_f64(a.val);
 
 1393    return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0)));
 
 1398    v2f64 a1 = msa_cvtrintq_f64(a.val);
 
 1399    return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0)));
 
 1404    return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0)));
 
 1407#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \ 
 1408inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ 
 1409                           const _Tpvec& a2, const _Tpvec& a3, \ 
 1410                           _Tpvec& b0, _Tpvec& b1, \ 
 1411                           _Tpvec& b2, _Tpvec& b3) \ 
 1413    _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ 
 1414    _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ 
 1415    _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \ 
 1416    _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \ 
 1417    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \ 
 1418    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \ 
 1419    b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \ 
 1420    b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \ 
 1423OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(
v_uint32x4, v4u32, v4i32, s32)
 
 1424OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(
v_int32x4, v4i32, v4i32, s32)
 
 1425OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(
v_float32x4, v4f32, v4i32, s32)
 
 1427#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \ 
 1428inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ 
 1430    msa_ld2q_##suffix(ptr, &a.val, &b.val); \ 
 1432inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ 
 1434    msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \ 
 1436inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ 
 1437                                v_##_Tpvec& c, v_##_Tpvec& d) \ 
 1439    msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \ 
 1441inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ 
 1442                                hal::StoreMode =hal::STORE_UNALIGNED) \ 
 1444    msa_st2q_##suffix(ptr, a.val, b.val); \ 
 1446inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ 
 1447                                const v_##_Tpvec& c, hal::StoreMode =hal::STORE_UNALIGNED) \ 
 1449    msa_st3q_##suffix(ptr, a.val, b.val, c.val); \ 
 1451inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ 
 1452                                const v_##_Tpvec& c, const v_##_Tpvec& d, \ 
 1453                                hal::StoreMode =hal::STORE_UNALIGNED ) \ 
 1455    msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \ 
 1458OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, 
uchar, u8)
 
 1459OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, 
schar, s8)
 
 1460OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, 
ushort, u16)
 
 1461OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, 
short, s16)
 
 1462OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, 
unsigned, u32)
 
 1463OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, 
int, s32)
 
 1464OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, 
float, f32)
 
 1465OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, 
uint64, u64)
 
 1466OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, 
int64, s64)
 
 1467OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, 
double, f64)
 
 1477    return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0f)));
 
 1482    return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val));
 
 1487    return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
 
 1492    return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val)));
 
 1617    return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + 
idx[0]), msa_ld1_s16(tab + 
idx[1])));
 
 1636    return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + 
idx[0]), msa_ld1_s32(tab + 
idx[1])));
 
 1648    return v_int64x2(msa_combine_s64(msa_create_s64(tab[
idx[0]]), msa_create_s64(tab[
idx[1]])));
 
 1654inline v_uint64x2 v_lut(
const uint64_t* tab, 
const int* 
idx) { 
return v_reinterpret_as_u64(
v_lut((
const int64_t *)tab, 
idx)); }
 
 1675    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems)));
 
 1694        tab[msa_getq_lane_s32(idxvec.val, 0)],
 
 1695        tab[msa_getq_lane_s32(idxvec.val, 1)],
 
 1696        tab[msa_getq_lane_s32(idxvec.val, 2)],
 
 1697        tab[msa_getq_lane_s32(idxvec.val, 3)]
 
 1715    v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + 
idx[0]), msa_ld1_f32(tab + 
idx[2]));
 
 1716    v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + 
idx[1]), msa_ld1_f32(tab + 
idx[3]));
 
 1717    x = 
v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
 
 1718    y = 
v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02))));
 
 1723    v_int8x16 c = 
v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val));
 
 1730    v_int8x16 c = 
v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val));
 
 1737    v_int16x8 c = 
v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val));
 
 1745    v_int16x8 c = 
v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val));
 
 1754    c.val[0] = vec.val[0];
 
 1755    c.val[1] = vec.val[2];
 
 1756    c.val[2] = vec.val[1];
 
 1757    c.val[3] = vec.val[3];
 
 1766    v_int8x16 c = 
v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val));
 
 1774    v_int16x8 c = 
v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val));
 
 1811    v2f64 xy0 = msa_ld1q_f64(tab + 
idx[0]);
 
 1812    v2f64 xy1 = msa_ld1q_f64(tab + 
idx[1]);
 
 1813    x = 
v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
 
 1814    y = 
v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
 
 1817template<
int i, typename _Tp>
 
 1818inline typename _Tp::lane_type 
v_extract_n(const _Tp& a)
 
 1820    return v_rotate_right<i>(a).get0();
 
 1826    return v_setall_u32(v_extract_n<i>(a));
 
 1831    return v_setall_s32(v_extract_n<i>(a));
 
 1836    return v_setall_f32(v_extract_n<i>(a));
 
 1844    v4f16 v = (v4f16)msa_ld1_s16((
const short*)ptr);
 
 1846    v4f16 v = msa_ld1_f16((
const __fp16*)ptr);
 
 1853    v4f16 hv = msa_cvt_f16_f32(v.val);
 
 1856    msa_st1_s16((
short*)ptr, (int16x4_t)hv);
 
 1858    msa_st1_f16((__fp16*)ptr, hv);
 
 1865    for( 
int i = 0; i < 4; i++ )
 
 1866        buf[i] = (
float)ptr[i];
 
 1874    for( 
int i = 0; i < 4; i++ )
 
 1875        ptr[i] = (hfloat)buf[i];
 
 1881CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
const int * idx
Definition core_c.h:668
const CvArr CvArr * x
Definition core_c.h:1195
const CvArr * y
Definition core_c.h:1187
signed char schar
Definition interface.h:48
#define CV_BIG_UINT(n)
Definition interface.h:64
uint32_t uint
Definition interface.h:42
unsigned char uchar
Definition interface.h:51
int64_t int64
Definition interface.h:61
unsigned short ushort
Definition interface.h:52
uint64_t uint64
Definition interface.h:62
bool v_check_any(const v_reg< _Tp, n > &a)
Check if any of packed values is less than zero.
Definition intrin_cpp.hpp:1433
v_reg< float, n > v_matmul(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication.
Definition intrin_cpp.hpp:3193
v_reg< int, n > v_round(const v_reg< float, n > &a)
Round elements.
Definition intrin_cpp.hpp:2424
v_reg< schar, 16 > v_int8x16
Sixteen 8-bit signed integer values.
Definition intrin_cpp.hpp:491
v_reg< uchar, 16 > v_uint8x16
Sixteen 8-bit unsigned integer values.
Definition intrin_cpp.hpp:489
int v_signmask(const v_reg< _Tp, n > &a)
Get negative values mask.
Definition intrin_cpp.hpp:1392
v_reg< int64, 2 > v_int64x2
Two 64-bit signed integer values.
Definition intrin_cpp.hpp:507
void v_store(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory.
Definition intrin_cpp.hpp:2190
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements and expand.
Definition intrin_cpp.hpp:1142
V_TypeTraits< typenameV_TypeTraits< _Tp >::abs_type >::sum_type v_reduce_sad(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Sum absolute differences of values.
Definition intrin_cpp.hpp:1374
v_reg< int, n > v_ceil(const v_reg< float, n > &a)
Ceil elements.
Definition intrin_cpp.hpp:2462
v_reg< ushort, 8 > v_uint16x8
Eight 16-bit unsigned integer values.
Definition intrin_cpp.hpp:493
v_reg< _Tp, n > v_pack_triplets(const v_reg< _Tp, n > &vec)
Definition intrin_cpp.hpp:2733
v_reg< int, n > v_floor(const v_reg< float, n > &a)
Floor elements.
Definition intrin_cpp.hpp:2449
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Dot product of elements.
Definition intrin_cpp.hpp:1077
int v_scan_forward(const v_reg< _Tp, n > &a)
Get first negative lane index.
Definition intrin_cpp.hpp:1409
v_reg< _Tp, n > v_reverse(const v_reg< _Tp, n > &a)
Vector reverse order.
Definition intrin_cpp.hpp:2343
v_reg< typename V_TypeTraits< _Tp >::w_type, simd128_width/sizeof(typename V_TypeTraits< _Tp >::w_type)> v_load_expand(const _Tp *ptr)
Load register contents from memory with double expand.
Definition intrin_cpp.hpp:1872
v_reg< int, 4 > v_int32x4
Four 32-bit signed integer values.
Definition intrin_cpp.hpp:499
v_reg< typename V_TypeTraits< _Tp >::abs_type, n > v_absdiff(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Add values without saturation.
Definition intrin_cpp.hpp:953
v_reg< _Tp, n > v_interleave_pairs(const v_reg< _Tp, n > &vec)
Definition intrin_cpp.hpp:2703
V_TypeTraits< _Tp >::sum_type v_reduce_sum(const v_reg< _Tp, n > &a)
Element shift left among vector.
Definition intrin_cpp.hpp:1335
v_reg< _Tp, n > v_muladd(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
A synonym for v_fma.
Definition intrin_cpp.hpp:1057
v_reg< _Tp, n > v_sqr_magnitude(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Square of the magnitude.
Definition intrin_cpp.hpp:1033
v_reg< int, n > v_trunc(const v_reg< float, n > &a)
Truncate elements.
Definition intrin_cpp.hpp:2475
v_reg< unsigned, 4 > v_uint32x4
Four 32-bit unsigned integer values.
Definition intrin_cpp.hpp:497
v_reg< _Tp, n > v_invsqrt(const v_reg< _Tp, n > &a)
Inversed square root.
Definition intrin_cpp.hpp:1007
v_reg< _Tp, n > v_magnitude(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Magnitude.
Definition intrin_cpp.hpp:1020
v_reg< typename V_TypeTraits< _Tp >::q_type, n/4 > v_dotprod_expand_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements and expand.
Definition intrin_cpp.hpp:1185
CV_INLINE v_reg< double,(n/2)> v_cvt_f64_high(const v_reg< int, n > &a)
Convert to double high part of vector.
Definition intrin_cpp.hpp:2584
v_reg< float, n > v_reduce_sum4(const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Sums all elements of each input vector, returns the vector of sums.
Definition intrin_cpp.hpp:1353
void v_mul_expand(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &c, v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > &d)
Multiply and expand.
Definition intrin_cpp.hpp:1216
v_reg< _Tp, n > v_broadcast_element(const v_reg< _Tp, n > &a)
Broadcast i-th element of vector.
Definition intrin_cpp.hpp:2413
void v_pack_store(hfloat *ptr, const v_reg< float, n > &v)
Definition intrin_cpp.hpp:3289
v_reg< _Tp, n > v_interleave_quads(const v_reg< _Tp, n > &vec)
Definition intrin_cpp.hpp:2716
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_load(const _Tp *ptr)
Load register contents from memory.
Definition intrin_cpp.hpp:1584
CV_INLINE v_reg< _Tp, n > operator~(const v_reg< _Tp, n > &a)
Bitwise NOT.
CV_INLINE v_reg< double, n/2 > v_cvt_f64(const v_reg< int, n > &a)
Convert lower half to double.
Definition intrin_cpp.hpp:2573
v_reg< typename V_TypeTraits< _Tp >::q_type, simd128_width/sizeof(typename V_TypeTraits< _Tp >::q_type)> v_load_expand_q(const _Tp *ptr)
Load register contents from memory with quad expand.
Definition intrin_cpp.hpp:1961
v_reg< uchar, 2 *n > v_pack_b(const v_reg< ushort, n > &a, const v_reg< ushort, n > &b)
! For 16-bit boolean values
Definition intrin_cpp.hpp:3111
void v_cleanup()
Definition intrin_cpp.hpp:3297
v_reg< _Tp, n > v_fma(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b, const v_reg< _Tp, n > &c)
Multiply and add.
Definition intrin_cpp.hpp:1046
void v_lut_deinterleave(const float *tab, const v_reg< int, n > &idx, v_reg< float, n > &x, v_reg< float, n > &y)
Definition intrin_cpp.hpp:2681
v_reg< _Tp, n > v_absdiffs(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Saturating absolute difference.
Definition intrin_cpp.hpp:994
v_reg< uint64, 2 > v_uint64x2
Two 64-bit unsigned integer values.
Definition intrin_cpp.hpp:505
v_reg< typename V_TypeTraits< _Tp >::w_type, n/2 > v_dotprod_fast(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Fast Dot product of elements.
Definition intrin_cpp.hpp:1116
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut(const _Tp *tab, const int *idx)
Definition intrin_cpp.hpp:2626
v_reg< _Tp, n > v_mul_hi(const v_reg< _Tp, n > &a, const v_reg< _Tp, n > &b)
Multiply and extract high part.
Definition intrin_cpp.hpp:1233
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut_quads(const _Tp *tab, const int *idx)
Definition intrin_cpp.hpp:2640
v_reg< float, 4 > v_float32x4
Four 32-bit floating point values (single precision)
Definition intrin_cpp.hpp:501
v_reg< float, n > v_cvt_f32(const v_reg< int, n > &a)
Convert to float.
Definition intrin_cpp.hpp:2534
bool v_check_all(const v_reg< _Tp, n > &a)
Check if all packed values are less than zero.
Definition intrin_cpp.hpp:1421
v_reg< _Tp, simd128_width/sizeof(_Tp)> v_lut_pairs(const _Tp *tab, const int *idx)
Definition intrin_cpp.hpp:2633
v_reg< float, n > v_matmuladd(const v_reg< float, n > &v, const v_reg< float, n > &a, const v_reg< float, n > &b, const v_reg< float, n > &c, const v_reg< float, n > &d)
Matrix multiplication and add.
Definition intrin_cpp.hpp:3223
_Tp v_extract_n(const v_reg< _Tp, n > &v)
Vector extract.
Definition intrin_cpp.hpp:2397
v_reg< float, n > v_not_nan(const v_reg< float, n > &a)
Less-than comparison.
Definition intrin_cpp.hpp:890
void v_store_aligned(_Tp *ptr, const v_reg< _Tp, n > &a)
Store data to memory (aligned)
Definition intrin_cpp.hpp:2251
v_reg< short, 8 > v_int16x8
Eight 16-bit signed integer values.
Definition intrin_cpp.hpp:495
v_reg< double, 2 > v_float64x2
Two 64-bit floating point values (double precision)
Definition intrin_cpp.hpp:503
#define CV_DECL_ALIGNED(x)
Definition cvdef.h:243
"black box" representation of the file storage associated with a file on disk.
Definition calib3d.hpp:441