EstervQrCode 1.1.1
Library for QR code manipulation
intrin_avx512.hpp
1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
4 
5 #ifndef OPENCV_HAL_INTRIN_AVX512_HPP
6 #define OPENCV_HAL_INTRIN_AVX512_HPP
7 
8 #if defined(_MSC_VER) && (_MSC_VER < 1920/*MSVS2019*/)
9 # pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
10 # pragma warning(disable:4309) // 'argument': truncation of constant value
11 # pragma warning(disable:4310) // cast truncates constant value
12 #endif
13 
14 #define CVT_ROUND_MODES_IMPLEMENTED 0
15 
16 #define CV_SIMD512 1
17 #define CV_SIMD512_64F 1
18 #define CV_SIMD512_FP16 0 // no native operations with FP16 type. Only load/store conversions from/to float32x16 are available (if CV_FP16 == 1)
19 
20 #define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0))
21 #define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
22  _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
23  ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0))
24 #define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
25  a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
26  _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
27  ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
28  ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
29  ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0))
30 #define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
31  a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
32  a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
33  a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
34  _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
35  ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
36  ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
37  ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
38  ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
39  ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
40  ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
41  ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0))
42 #define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
43  a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
44  a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
45  a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \
46  _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
47  (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
48  (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
49  (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
50  (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
51  (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
52  (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
53  (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0))
54 
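The _v512_set_* macros above assemble 8- and 16-bit lane constants out of 32/64-bit words, presumably to sidestep compilers that lack _mm512_set_epi8/_mm512_set_epi16. A hypothetical standalone check of that byte packing, using only plain <immintrin.h> calls (AVX-512F) and not part of this header:

// Hypothetical check: the macro packs bytes a3..a0 into one 32-bit word as
// (a3<<24)|(a2<<16)|(a1<<8)|a0, so byte lane 0 of the register holds a0.
#include <immintrin.h>
#include <cstdio>

int main()
{
    unsigned word = (4u << 24) | (3u << 16) | (2u << 8) | 1u;  // a3=4, a2=3, a1=2, a0=1
    __m512i v = _mm512_set1_epi32((int)word);                  // broadcast to all 16 dwords

    unsigned char bytes[64];
    _mm512_storeu_si512(bytes, v);
    printf("%u %u %u %u\n", bytes[0], bytes[1], bytes[2], bytes[3]);  // 1 2 3 4
}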
55 #ifndef _mm512_cvtpd_pslo
56 #ifdef _mm512_zextsi256_si512
57 #define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
58 #else
59 // fallback: the preferred zero-extending cast is unavailable on this compiler
60 #define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
61 #endif
62 #endif
64 
65 namespace
66 {
67 
68 inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi)
69 { return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); }
70 
71 inline __m512 _v512_combine(const __m256& lo, const __m256& hi)
72 { return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); }
73 
74 inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi)
75 { return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); }
76 
77 inline int _v_cvtsi512_si32(const __m512i& a)
78 { return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); }
79 
80 inline __m256i _v512_extract_high(const __m512i& v)
81 { return _mm512_extracti32x8_epi32(v, 1); }
82 
83 inline __m256 _v512_extract_high(const __m512& v)
84 { return _mm512_extractf32x8_ps(v, 1); }
85 
86 inline __m256d _v512_extract_high(const __m512d& v)
87 { return _mm512_extractf64x4_pd(v, 1); }
88 
89 inline __m256i _v512_extract_low(const __m512i& v)
90 { return _mm512_castsi512_si256(v); }
91 
92 inline __m256 _v512_extract_low(const __m512& v)
93 { return _mm512_castps512_ps256(v); }
94 
95 inline __m256d _v512_extract_low(const __m512d& v)
96 { return _mm512_castpd512_pd256(v); }
97 
98 inline __m512i _v512_insert(const __m512i& a, const __m256i& b)
99 { return _mm512_inserti32x8(a, b, 0); }
100 
101 inline __m512 _v512_insert(const __m512& a, const __m256& b)
102 { return _mm512_insertf32x8(a, b, 0); }
103 
104 inline __m512d _v512_insert(const __m512d& a, const __m256d& b)
105 { return _mm512_insertf64x4(a, b, 0); }
106 
107 }
108 
109 namespace cv
110 {
111 
113 
114 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
115 
117 
118 struct v_uint8x64
119 {
120  typedef uchar lane_type;
121  enum { nlanes = 64 };
122  __m512i val;
123 
124  explicit v_uint8x64(__m512i v) : val(v) {}
125  v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3,
126  uchar v4, uchar v5, uchar v6, uchar v7,
127  uchar v8, uchar v9, uchar v10, uchar v11,
128  uchar v12, uchar v13, uchar v14, uchar v15,
129  uchar v16, uchar v17, uchar v18, uchar v19,
130  uchar v20, uchar v21, uchar v22, uchar v23,
131  uchar v24, uchar v25, uchar v26, uchar v27,
132  uchar v28, uchar v29, uchar v30, uchar v31,
133  uchar v32, uchar v33, uchar v34, uchar v35,
134  uchar v36, uchar v37, uchar v38, uchar v39,
135  uchar v40, uchar v41, uchar v42, uchar v43,
136  uchar v44, uchar v45, uchar v46, uchar v47,
137  uchar v48, uchar v49, uchar v50, uchar v51,
138  uchar v52, uchar v53, uchar v54, uchar v55,
139  uchar v56, uchar v57, uchar v58, uchar v59,
140  uchar v60, uchar v61, uchar v62, uchar v63)
141  {
142  val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
143  v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
144  v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
145  v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
146  }
147  v_uint8x64() {}
148 
149  static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }
150 
151  uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
152 };
153 
154 struct v_int8x64
155 {
156  typedef schar lane_type;
157  enum { nlanes = 64 };
158  __m512i val;
159 
160  explicit v_int8x64(__m512i v) : val(v) {}
161  v_int8x64(schar v0, schar v1, schar v2, schar v3,
162  schar v4, schar v5, schar v6, schar v7,
163  schar v8, schar v9, schar v10, schar v11,
164  schar v12, schar v13, schar v14, schar v15,
165  schar v16, schar v17, schar v18, schar v19,
166  schar v20, schar v21, schar v22, schar v23,
167  schar v24, schar v25, schar v26, schar v27,
168  schar v28, schar v29, schar v30, schar v31,
169  schar v32, schar v33, schar v34, schar v35,
170  schar v36, schar v37, schar v38, schar v39,
171  schar v40, schar v41, schar v42, schar v43,
172  schar v44, schar v45, schar v46, schar v47,
173  schar v48, schar v49, schar v50, schar v51,
174  schar v52, schar v53, schar v54, schar v55,
175  schar v56, schar v57, schar v58, schar v59,
176  schar v60, schar v61, schar v62, schar v63)
177  {
178  val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
179  v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
180  v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
181  v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
182  }
183  v_int8x64() {}
184 
185  static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }
186 
187  schar get0() const { return (schar)_v_cvtsi512_si32(val); }
188 };
189 
190 struct v_uint16x32
191 {
192  typedef ushort lane_type;
193  enum { nlanes = 32 };
194  __m512i val;
195 
196  explicit v_uint16x32(__m512i v) : val(v) {}
197  v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3,
198  ushort v4, ushort v5, ushort v6, ushort v7,
199  ushort v8, ushort v9, ushort v10, ushort v11,
200  ushort v12, ushort v13, ushort v14, ushort v15,
201  ushort v16, ushort v17, ushort v18, ushort v19,
202  ushort v20, ushort v21, ushort v22, ushort v23,
203  ushort v24, ushort v25, ushort v26, ushort v27,
204  ushort v28, ushort v29, ushort v30, ushort v31)
205  {
206  val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
207  v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
208  }
209  v_uint16x32() {}
210 
211  static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }
212 
213  ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
214 };
215 
216 struct v_int16x32
217 {
218  typedef short lane_type;
219  enum { nlanes = 32 };
220  __m512i val;
221 
222  explicit v_int16x32(__m512i v) : val(v) {}
223  v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7,
224  short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15,
225  short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
226  short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)
227  {
228  val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24,
229  (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16,
230  (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
231  (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0);
232  }
233  v_int16x32() {}
234 
235  static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }
236 
237  short get0() const { return (short)_v_cvtsi512_si32(val); }
238 };
239 
240 struct v_uint32x16
241 {
242  typedef unsigned lane_type;
243  enum { nlanes = 16 };
244  __m512i val;
245 
246  explicit v_uint32x16(__m512i v) : val(v) {}
247  v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
248  unsigned v4, unsigned v5, unsigned v6, unsigned v7,
249  unsigned v8, unsigned v9, unsigned v10, unsigned v11,
250  unsigned v12, unsigned v13, unsigned v14, unsigned v15)
251  {
252  val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7,
253  (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15);
254  }
255  v_uint32x16() {}
256 
257  static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }
258 
259  unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
260 };
261 
262 struct v_int32x16
263 {
264  typedef int lane_type;
265  enum { nlanes = 16 };
266  __m512i val;
267 
268  explicit v_int32x16(__m512i v) : val(v) {}
269  v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
270  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
271  {
272  val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
273  }
274  v_int32x16() {}
275 
276  static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }
277 
278  int get0() const { return _v_cvtsi512_si32(val); }
279 };
280 
281 struct v_float32x16
282 {
283  typedef float lane_type;
284  enum { nlanes = 16 };
285  __m512 val;
286 
287  explicit v_float32x16(__m512 v) : val(v) {}
288  v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7,
289  float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
290  {
291  val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
292  }
293  v_float32x16() {}
294 
295  static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }
296 
297  float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
298 };
299 
300 struct v_uint64x8
301 {
302  typedef uint64 lane_type;
303  enum { nlanes = 8 };
304  __m512i val;
305 
306  explicit v_uint64x8(__m512i v) : val(v) {}
307  v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)
308  { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); }
309  v_uint64x8() {}
310 
311  static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }
312 
313  uint64 get0() const
314  {
315  #if defined __x86_64__ || defined _M_X64
316  return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
317  #else
318  int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
319  int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
320  return (unsigned)a | ((uint64)(unsigned)b << 32);
321  #endif
322  }
323 };
324 
325 struct v_int64x8
326 {
327  typedef int64 lane_type;
328  enum { nlanes = 8 };
329  __m512i val;
330 
331  explicit v_int64x8(__m512i v) : val(v) {}
332  v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
333  { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); }
334  v_int64x8() {}
335 
336  static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }
337 
338  int64 get0() const
339  {
340  #if defined __x86_64__ || defined _M_X64
341  return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
342  #else
343  int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
344  int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
345  return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
346  #endif
347  }
348 };
349 
350 struct v_float64x8
351 {
352  typedef double lane_type;
353  enum { nlanes = 8 };
354  __m512d val;
355 
356  explicit v_float64x8(__m512d v) : val(v) {}
357  v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
358  { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); }
359  v_float64x8() {}
360 
361  static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }
362 
363  double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
364 };
365 
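Each register wrapper above follows the same pattern: a lane_type typedef, an nlanes enum, the raw __m512/__m512i/__m512d payload in val, a zero() factory and get0() for lane 0. The get0() trick is just a downcast plus a scalar move; a minimal sketch with raw intrinsics (assumes AVX-512F, not part of the header):

#include <immintrin.h>
#include <cstdio>

int main()
{
    // get0() reads lane 0: cast the 512-bit register to its low 128 bits,
    // then move the low 32 bits into a scalar register.
    __m512i v = _mm512_setr_epi32(7, 1, 2, 3, 4, 5, 6, 0, 8, 9, 10, 11, 12, 13, 14, 15);
    int lane0 = _mm_cvtsi128_si32(_mm512_castsi512_si128(v));
    printf("%d\n", lane0);  // 7
}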
367 
368 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \
369  inline _Tpvec v512_load(const _Tp* ptr) \
370  { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \
371  inline _Tpvec v512_load_aligned(const _Tp* ptr) \
372  { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \
373  inline _Tpvec v512_load_low(const _Tp* ptr) \
374  { \
375  __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \
376  return _Tpvec(_mm512_castsi256_si512(v256)); \
377  } \
378  inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
379  { \
380  __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \
381  __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \
382  return _Tpvec(_v512_combine(vlo, vhi)); \
383  } \
384  inline void v_store(_Tp* ptr, const _Tpvec& a) \
385  { _mm512_storeu_si512((__m512i*)ptr, a.val); } \
386  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
387  { _mm512_store_si512((__m512i*)ptr, a.val); } \
388  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
389  { _mm512_stream_si512((__m512i*)ptr, a.val); } \
390  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
391  { \
392  if( mode == hal::STORE_UNALIGNED ) \
393  _mm512_storeu_si512((__m512i*)ptr, a.val); \
394  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
395  _mm512_stream_si512((__m512i*)ptr, a.val); \
396  else \
397  _mm512_store_si512((__m512i*)ptr, a.val); \
398  } \
399  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
400  { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \
401  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
402  { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }
403 
404 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64, uchar)
405 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64, schar)
406 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
407 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32, short)
408 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned)
409 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16, int)
410 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8, uint64)
411 OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8, int64)
412 
413 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
414  inline _Tpvec v512_load(const _Tp* ptr) \
415  { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \
416  inline _Tpvec v512_load_aligned(const _Tp* ptr) \
417  { return _Tpvec(_mm512_load_##suffix(ptr)); } \
418  inline _Tpvec v512_load_low(const _Tp* ptr) \
419  { \
420  return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \
421  (_mm256_loadu_##suffix(ptr))); \
422  } \
423  inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
424  { \
425  halfreg vlo = _mm256_loadu_##suffix(ptr0); \
426  halfreg vhi = _mm256_loadu_##suffix(ptr1); \
427  return _Tpvec(_v512_combine(vlo, vhi)); \
428  } \
429  inline void v_store(_Tp* ptr, const _Tpvec& a) \
430  { _mm512_storeu_##suffix(ptr, a.val); } \
431  inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
432  { _mm512_store_##suffix(ptr, a.val); } \
433  inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
434  { _mm512_stream_##suffix(ptr, a.val); } \
435  inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
436  { \
437  if( mode == hal::STORE_UNALIGNED ) \
438  _mm512_storeu_##suffix(ptr, a.val); \
439  else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
440  _mm512_stream_##suffix(ptr, a.val); \
441  else \
442  _mm512_store_##suffix(ptr, a.val); \
443  } \
444  inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
445  { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \
446  inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
447  { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }
448 
449 OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float, ps, __m256)
450 OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)
451 
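The load/store macros dispatch between the three AVX-512 store flavours via hal::StoreMode. A minimal sketch of what they reduce to for float data (the aligned and streaming variants require a 64-byte aligned destination):

#include <immintrin.h>
#include <cstdio>

int main()
{
    alignas(64) float dst[16];
    __m512 v = _mm512_set1_ps(1.5f);

    _mm512_storeu_ps(dst, v);   // STORE_UNALIGNED: no alignment requirement
    _mm512_store_ps(dst, v);    // default: 64-byte aligned store
    _mm512_stream_ps(dst, v);   // STORE_ALIGNED_NOCACHE: non-temporal, bypasses the cache
    _mm_sfence();               // order the streaming store before later loads

    printf("%f\n", dst[0]);     // 1.5
}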
452 #define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \
453  inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
454  { return _Tpvec(cast(a.val)); }
455 
456 #define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
457  inline _Tpvec v512_setzero_##suffix() \
458  { return _Tpvec(_mm512_setzero_si512()); } \
459  inline _Tpvec v512_setall_##suffix(_Tp v) \
460  { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \
461  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \
462  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \
463  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \
464  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, OPENCV_HAL_NOP) \
465  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, OPENCV_HAL_NOP) \
466  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, OPENCV_HAL_NOP) \
467  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, OPENCV_HAL_NOP) \
468  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, OPENCV_HAL_NOP) \
469  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
470  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8, suffix, _mm512_castpd_si512)
471 
472 OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64, uchar, u8, epi8, char)
473 OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64, schar, s8, epi8, char)
474 OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort, u16, epi16, short)
475 OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32, short, s16, epi16, short)
476 OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int)
477 OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16, int, s32, epi32, int)
478 OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8, uint64, u64, epi64, int64)
479 OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64)
480 
481 #define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
482  inline _Tpvec v512_setzero_##suffix() \
483  { return _Tpvec(_mm512_setzero_##zsuffix()); } \
484  inline _Tpvec v512_setall_##suffix(_Tp v) \
485  { return _Tpvec(_mm512_set1_##zsuffix(v)); } \
486  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \
487  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \
488  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \
489  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, cast) \
490  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \
491  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, cast) \
492  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, cast) \
493  OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, cast)
494 
495 OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float, f32, ps, _mm512_castsi512_ps)
496 OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8, double, f64, pd, _mm512_castsi512_pd)
497 
498 inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a)
499 { return a; }
500 inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a)
501 { return v_float32x16(_mm512_castpd_ps(a.val)); }
502 
503 inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a)
504 { return a; }
505 inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
506 { return v_float64x8(_mm512_castps_pd(a.val)); }
507 
508 // FP16
509 inline v_float32x16 v512_load_expand(const hfloat* ptr)
510 {
511  return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
512 }
513 
514 inline void v_pack_store(hfloat* ptr, const v_float32x16& a)
515 {
516  __m256i ah = _mm512_cvtps_ph(a.val, 0);
517  _mm256_storeu_si256((__m256i*)ptr, ah);
518 }
519 
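v512_load_expand and v_pack_store wrap the 512-bit half-float conversions: sixteen FP16 values in a __m256i widen to a __m512 and back. A round-trip sketch with the same intrinsics and the same rounding argument (0) used above:

#include <immintrin.h>
#include <cstdio>

int main()
{
    float src[16], dst[16];
    for (int i = 0; i < 16; i++) src[i] = 0.5f * i;             // exactly representable in FP16

    __m256i half = _mm512_cvtps_ph(_mm512_loadu_ps(src), 0);    // float32x16 -> float16x16
    _mm512_storeu_ps(dst, _mm512_cvtph_ps(half));               // float16x16 -> float32x16

    printf("%g %g\n", dst[1], dst[15]);  // 0.5 7.5
}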
520 /* Recombine & ZIP */
521 inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
522 {
523 #if CV_AVX_512VBMI
524  __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24,
525  87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16,
526  79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8,
527  71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0);
528  ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val));
529  __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
530  119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
531  111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
532  103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32);
533  ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val));
534 #else
535  __m512i low = _mm512_unpacklo_epi8(a.val, b.val);
536  __m512i high = _mm512_unpackhi_epi8(a.val, b.val);
537  ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0), high));
538  ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high));
539 #endif
540 }
541 inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
542 {
543  __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8,
544  39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0);
545  ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val));
546  __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
547  55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
548  ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val));
549 }
550 inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
551 {
552  __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
553  ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val));
554  __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
555  ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val));
556 }
557 inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
558 {
559  __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0);
560  ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val));
561  __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
562  ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val));
563 }
564 
565 inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1)
566 {
567  v_int8x64 i0, i1;
568  v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1);
569  ab0 = v_reinterpret_as_u8(i0);
570  ab1 = v_reinterpret_as_u8(i1);
571 }
572 inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1)
573 {
574  v_int16x32 i0, i1;
575  v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1);
576  ab0 = v_reinterpret_as_u16(i0);
577  ab1 = v_reinterpret_as_u16(i1);
578 }
579 inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1)
580 {
581  v_int32x16 i0, i1;
582  v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
583  ab0 = v_reinterpret_as_u32(i0);
584  ab1 = v_reinterpret_as_u32(i1);
585 }
586 inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1)
587 {
588  v_int64x8 i0, i1;
589  v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
590  ab0 = v_reinterpret_as_u64(i0);
591  ab1 = v_reinterpret_as_u64(i1);
592 }
593 inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1)
594 {
595  v_int32x16 i0, i1;
596  v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
597  ab0 = v_reinterpret_as_f32(i0);
598  ab1 = v_reinterpret_as_f32(i1);
599 }
600 inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1)
601 {
602  v_int64x8 i0, i1;
603  v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
604  ab0 = v_reinterpret_as_f64(i0);
605  ab1 = v_reinterpret_as_f64(i1);
606 }
607 
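On targets without AVX-512VBMI, the byte v_zip above uses unpacklo/unpackhi (which interleave only within each 128-bit block) followed by a 64-bit cross-lane permute to restore whole-register order. A standalone sketch of that path (assumes AVX-512F + BW):

#include <immintrin.h>
#include <cstdio>

int main()
{
    unsigned char av[64], bv[64], out[64];
    for (int i = 0; i < 64; i++) { av[i] = (unsigned char)i; bv[i] = (unsigned char)(100 + i); }

    __m512i a = _mm512_loadu_si512(av), b = _mm512_loadu_si512(bv);
    __m512i lo = _mm512_unpacklo_epi8(a, b);   // interleaved, but per 128-bit block
    __m512i hi = _mm512_unpackhi_epi8(a, b);
    // same permute constant as _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0) above
    __m512i ab0 = _mm512_permutex2var_epi64(lo, _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0), hi);

    _mm512_storeu_si512(out, ab0);
    printf("%u %u %u %u ... %u %u\n", out[0], out[1], out[2], out[3], out[62], out[63]);
    // 0 100 1 101 ... 31 131  (first halves of a and b interleaved lane by lane)
}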
608 #define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \
609  inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
610  { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
611  inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
612  { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \
613  inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
614  _Tpvec& c, _Tpvec& d) \
615  { \
616  c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val)); \
617  d.val = _v512_insert(b.val,_v512_extract_high(a.val)); \
618  }
619 
620 
621 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64, epi8)
622 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64, epi8)
623 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32, epi16)
624 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32, epi16)
625 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16, epi32)
626 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16, epi32)
627 OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8, epi64)
628 OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8, epi64)
629 OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
630 OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8, pd)
631 
632 
634 /* Element-wise binary and unary operations */
635 
636 
637 #define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \
638  inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
639  { return _Tpvec(intrin(a.val, b.val)); }
640 
641 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8)
642 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8)
643 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
644 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16)
645 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8)
646 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8)
647 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
648 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16)
649 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16)
650 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16)
651 
652 inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
653 {
654  __m512i ad = _mm512_srai_epi16(a.val, 8);
655  __m512i bd = _mm512_srai_epi16(b.val, 8);
656  __m512i p0 = _mm512_mullo_epi16(a.val, b.val); // even
657  __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8); // odd
658  return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1));
659 }
660 inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
661 {
662  return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
663 }
664 
665 #define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \
666  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
667  { return _Tpvec(intrin(a.val, b.val)); } \
668  inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
669  { a.val = intrin(a.val, b.val); return a; }
670 
671 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
672 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
673 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
674 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
675 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
676 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
677 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
678 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
679 
680 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
681 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
682 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
683 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
684 
685 
686 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8)
687 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8)
688 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8)
689 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8)
690 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
691 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
692 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16)
693 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16)
694 
695 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
696 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
697 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
698 OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
699 OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
700 OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
701 OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
702 OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
703 
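Note that operator+ / operator- on 8- and 16-bit vectors are saturating (adds/subs), while v_add_wrap / v_sub_wrap above wrap modulo the lane width. The difference, as a small sketch (assumes AVX-512BW):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m512i a = _mm512_set1_epi8((char)250), b = _mm512_set1_epi8(10);

    unsigned char sat[64], wrap[64];
    _mm512_storeu_si512(sat,  _mm512_adds_epu8(a, b));  // saturating: clamps at 255
    _mm512_storeu_si512(wrap, _mm512_add_epi8(a, b));   // wrapping: (250 + 10) mod 256
    printf("%u %u\n", sat[0], wrap[0]);  // 255 4
}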
704 // saturating multiply
705 inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
706 {
707  v_uint16x32 c, d;
708  v_mul_expand(a, b, c, d);
709  return v_pack(c, d);
710 }
711 inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
712 {
713  v_int16x32 c, d;
714  v_mul_expand(a, b, c, d);
715  return v_pack(c, d);
716 }
717 inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
718 {
719  __m512i pl = _mm512_mullo_epi16(a.val, b.val);
720  __m512i ph = _mm512_mulhi_epu16(a.val, b.val);
721  __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
722  __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
723 
724  const __m512i m = _mm512_set1_epi32(65535);
725  return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
726 }
727 inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
728 {
729  __m512i pl = _mm512_mullo_epi16(a.val, b.val);
730  __m512i ph = _mm512_mulhi_epi16(a.val, b.val);
731  __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
732  __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
733  return v_int16x32(_mm512_packs_epi32(p0, p1));
734 }
735 
736 inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
737 { a = a * b; return a; }
738 inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
739 { a = a * b; return a; }
740 inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
741 { a = a * b; return a; }
742 inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
743 { a = a * b; return a; }
744 
745 inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
746 inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }
747 
748 // Multiply and expand
749 inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
750  v_uint16x32& c, v_uint16x32& d)
751 {
752  v_uint16x32 a0, a1, b0, b1;
753  v_expand(a, a0, a1);
754  v_expand(b, b0, b1);
755  c = v_mul_wrap(a0, b0);
756  d = v_mul_wrap(a1, b1);
757 }
758 
759 inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
760  v_int16x32& c, v_int16x32& d)
761 {
762  v_int16x32 a0, a1, b0, b1;
763  v_expand(a, a0, a1);
764  v_expand(b, b0, b1);
765  c = v_mul_wrap(a0, b0);
766  d = v_mul_wrap(a1, b1);
767 }
768 
769 inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
770  v_int32x16& c, v_int32x16& d)
771 {
772  v_int16x32 v0, v1;
773  v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
774 
775  c = v_reinterpret_as_s32(v0);
776  d = v_reinterpret_as_s32(v1);
777 }
778 
779 inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
780  v_uint32x16& c, v_uint32x16& d)
781 {
782  v_uint16x32 v0, v1;
783  v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);
784 
785  c = v_reinterpret_as_u32(v0);
786  d = v_reinterpret_as_u32(v1);
787 }
788 
789 inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b,
790  v_uint64x8& c, v_uint64x8& d)
791 {
792  v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)),
793  v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
794 }
795 
796 inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
797  v_int64x8& c, v_int64x8& d)
798 {
799  v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)),
800  v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
801 }
802 
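For 16-bit lanes, v_mul_expand builds the full 32-bit products by computing the low and high halves separately (mullo/mulhi) and zipping them back together. A sketch of one lane (assumes AVX-512BW, not part of the header):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m512i a = _mm512_set1_epi16(300), b = _mm512_set1_epi16(400);

    __m512i lo = _mm512_mullo_epi16(a, b);          // low 16 bits of each product
    __m512i hi = _mm512_mulhi_epi16(a, b);          // high 16 bits of each product
    __m512i prod = _mm512_unpacklo_epi16(lo, hi);   // interleave -> full 32-bit products

    int out[16];
    _mm512_storeu_si512(out, prod);
    printf("%d\n", out[0]);  // 120000 == 300 * 400
}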
804 #define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
805  inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
806  { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
807  inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
808  { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
809  inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
810  { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
811  inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
812  { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \
813  template<int imm> \
814  inline _Tpuvec v_shl(const _Tpuvec& a) \
815  { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \
816  template<int imm> \
817  inline _Tpsvec v_shl(const _Tpsvec& a) \
818  { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \
819  template<int imm> \
820  inline _Tpuvec v_shr(const _Tpuvec& a) \
821  { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \
822  template<int imm> \
823  inline _Tpsvec v_shr(const _Tpsvec& a) \
824  { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }
825 
826 OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
827 OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
828 OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64)
829 
830 
831 
832 #define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
833  OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \
834  OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \
835  OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \
836  inline _Tpvec operator ~ (const _Tpvec& a) \
837  { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
838 
839 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1))
840 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64, si512, _mm512_set1_epi32(-1))
841 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32, si512, _mm512_set1_epi32(-1))
842 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32, si512, _mm512_set1_epi32(-1))
843 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16, si512, _mm512_set1_epi32(-1))
844 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16, si512, _mm512_set1_epi32(-1))
845 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8, si512, _mm512_set1_epi64(-1))
846 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8, si512, _mm512_set1_epi64(-1))
847 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps, _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
848 OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8, pd, _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
849 
851 #define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \
852  inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
853  { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }
854 
855 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64, epi8, si512)
856 OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64, epi8, si512)
857 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512)
858 OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32, epi16, si512)
859 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512)
860 OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16, epi32, si512)
861 OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8, epi64, si512)
862 OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8, epi64, si512)
863 OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps, ps)
864 OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd)
865 
866 
867 #define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
868  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
869  { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
870 
871 #define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \
872  OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \
873  OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \
874  OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \
875  OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
876  OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \
877  OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
878 
879 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1)
880 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1)
881 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
882 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32, epi16, epi16, (short)-1)
883 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
884 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16, epi32, epi32, (int)-1)
885 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1)
886 OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1)
887 
888 #define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
889  inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
890  { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }
891 
892 #define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \
893  OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \
894  OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
895  OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \
896  OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \
897  OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \
898  OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval)
899 
900 OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
901 OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1)
902 
903 inline v_float32x16 v_not_nan(const v_float32x16& a)
904 { return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); }
905 inline v_float64x8 v_not_nan(const v_float64x8& a)
906 { return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); }
907 
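AVX-512 comparisons produce a bit mask (__mmask8/16/32/64) rather than a vector, so the comparison operators above re-materialize an all-ones/all-zeros vector with _mm512_maskz_set1_*, which is the mask representation the rest of the universal intrinsics (e.g. v_select above) expect. A minimal sketch (AVX-512F):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m512 a = _mm512_set1_ps(1.0f), b = _mm512_set1_ps(2.0f);

    __mmask16 m = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);   // 0xffff: every lane 1.0 < 2.0
    __m512i vecmask = _mm512_maskz_set1_epi32(m, -1);     // back to an all-ones lane mask

    int out[16];
    _mm512_storeu_si512(out, vecmask);
    printf("0x%04x %d\n", (unsigned)m, out[0]);  // 0xffff -1
}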
909 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64, _mm512_min_epu8)
910 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64, _mm512_max_epu8)
911 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64, _mm512_min_epi8)
912 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64, _mm512_max_epi8)
913 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32, _mm512_min_epu16)
914 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32, _mm512_max_epu16)
915 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32, _mm512_min_epi16)
916 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32, _mm512_max_epi16)
917 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16, _mm512_min_epu32)
918 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16, _mm512_max_epu32)
919 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16, _mm512_min_epi32)
920 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16, _mm512_max_epi32)
921 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8, _mm512_min_epu64)
922 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8, _mm512_max_epu64)
923 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8, _mm512_min_epi64)
924 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8, _mm512_max_epi64)
925 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
926 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
927 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8, _mm512_min_pd)
928 OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8, _mm512_max_pd)
929 
930 
931 namespace {
932  template<bool prec, int imm4, bool part, int imm32>
933  struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
934  template<int imm4, int imm32>
935  struct _v_rotate_right<true, imm4, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
936  {
937  return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 ), imm4 *8),
938  _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8)));
939  }};
940  template<int imm4>
941  struct _v_rotate_right<true, imm4, false, 15> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
942  {
943  return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15), imm4 *8),
944  _mm512_slli_epi32( b.val, (4-imm4)*8)));
945  }};
946  template<int imm4, int imm32>
947  struct _v_rotate_right<true, imm4, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
948  {
949  return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16), imm4 *8),
950  _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
951  }};
952  template<int imm4>
953  struct _v_rotate_right<true, imm4, true, 31> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
954  { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }};
955  template<int imm32>
956  struct _v_rotate_right<false, 0, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
957  { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }};
958  template<>
959  struct _v_rotate_right<false, 0, false, 0> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }};
960  template<int imm32>
961  struct _v_rotate_right<false, 0, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
962  { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }};
963  template<>
964  struct _v_rotate_right<false, 0, true, 16> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }};
965  template<>
966  struct _v_rotate_right<false, 0, true, 32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
967 }
968 template<int imm> inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
969 {
970  return imm >= 128 ? v_int8x64() :
971 #if CV_AVX_512VBMI
972  v_int8x64(_mm512_permutex2var_epi8(a.val,
973  _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
974  0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
975  0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
976  0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
977  0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
978  0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
979  0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
980  0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
981 #else
982  _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b);
983 #endif
984 }
985 template<int imm>
986 inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
987 {
988  if (imm == 0) return a;
989  if (imm == 64) return b;
990  if (imm >= 128) return v_int8x64();
991 #if CV_AVX_512VBMI
992  return v_int8x64(_mm512_permutex2var_epi8(b.val,
993  _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
994  0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
995  0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
996  0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
997  0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
998  0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
999  0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
1000  0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
1001 #else
1002  return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b);
1003 #endif
1004 }
1005 template<int imm>
1006 inline v_int8x64 v_rotate_right(const v_int8x64& a)
1007 {
1008  if (imm == 0) return a;
1009  if (imm >= 64) return v_int8x64();
1010 #if CV_AVX_512VBMI
1011  return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm,
1012  _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
1013  0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
1014  0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
1015  0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
1016  0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
1017  0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
1018  0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
1019  0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
1020 #else
1021  return v_rotate_right<imm>(a, v512_setzero_s8());
1022 #endif
1023 }
1024 template<int imm>
1025 inline v_int8x64 v_rotate_left(const v_int8x64& a)
1026 {
1027  if (imm == 0) return a;
1028  if (imm >= 64) return v_int8x64();
1029 #if CV_AVX_512VBMI
1030  return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
1031  _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
1032  0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
1033  0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
1034  0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
1035  0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
1036  0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
1037  0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
1038  0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
1039 #else
1040  return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
1041 #endif
1042 }
1043 
1044 #define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \
1045 template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1046 { return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1047 template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1048 { return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \
1049 template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1050 { return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); } \
1051 template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1052 { return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }
1053 
1054 #define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \
1055 template<int imm> \
1056 inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1057 { \
1058  enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
1059  enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
1060  if (imm == 0) return a; \
1061  if (imm == _Tpvec::nlanes) return b; \
1062  if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
1063  return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
1064 } \
1065 template<int imm> \
1066 inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1067 { \
1068  enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \
1069  enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \
1070  if (imm == 0) return a; \
1071  if (imm == _Tpvec::nlanes) return b; \
1072  if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \
1073  return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
1074 } \
1075 template<int imm> \
1076 inline _Tpvec v_rotate_left(const _Tpvec& a) \
1077 { \
1078  if (imm == 0) return a; \
1079  if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
1080  return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
1081 } \
1082 template<int imm> \
1083 inline _Tpvec v_rotate_right(const _Tpvec& a) \
1084 { \
1085  if (imm == 0) return a; \
1086  if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \
1087  return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \
1088 }
1089 
1090 OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64, u8)
1091 OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32, u16)
1092 OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32, s16)
1093 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16, epi32)
1094 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16, epi32)
1095 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8, epi64)
1096 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64)
1097 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
1098 OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd)
1099 
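When the byte offset is a whole number of 32-bit lanes and VBMI is unavailable, the 8-bit rotate path above falls back to _mm512_alignr_epi32 (the 32/64-bit variants instead use maskz_compress / mask_expand). A sketch of the alignr semantics, shifting the concatenation b:a right by whole dwords (AVX-512F):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m512i a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m512i b = _mm512_set1_epi32(99);

    // concatenation b:a shifted right by 2 dwords -> lanes 2..15 of a, then lanes of b
    __m512i r = _mm512_alignr_epi32(b, a, 2);

    int out[16];
    _mm512_storeu_si512(out, r);
    printf("%d %d %d\n", out[0], out[13], out[14]);  // 2 15 99
}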
1100 
1101 inline v_uint8x64 v_reverse(const v_uint8x64 &a)
1102 {
1103 #if CV_AVX_512VBMI
1104  static const __m512i perm = _mm512_set_epi32(
1105  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1106  0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
1107  0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
1108  0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
1109  return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
1110 #else
1111  static const __m512i shuf = _mm512_set_epi32(
1112  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1113  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1114  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
1115  0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
1116  static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
1117  __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
1118  return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
1119 #endif
1120 }
1121 
1122 inline v_int8x64 v_reverse(const v_int8x64 &a)
1123 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1124 
1125 inline v_uint16x32 v_reverse(const v_uint16x32 &a)
1126 {
1127 #if CV_AVX_512VBMI
1128  static const __m512i perm = _mm512_set_epi32(
1129  0x00000001, 0x00020003, 0x00040005, 0x00060007,
1130  0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
1131  0x00100011, 0x00120013, 0x00140015, 0x00160017,
1132  0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
1133  return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
1134 #else
1135  static const __m512i shuf = _mm512_set_epi32(
1136  0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1137  0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1138  0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
1139  0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
1140  static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
1141  __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
1142  return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
1143 #endif
1144 }
1145 
1146 inline v_int16x32 v_reverse(const v_int16x32 &a)
1147 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1148 
1149 inline v_uint32x16 v_reverse(const v_uint32x16 &a)
1150 {
1151  static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1152  return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
1153 }
1154 
1155 inline v_int32x16 v_reverse(const v_int32x16 &a)
1156 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1157 
1158 inline v_float32x16 v_reverse(const v_float32x16 &a)
1159 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1160 
1161 inline v_uint64x8 v_reverse(const v_uint64x8 &a)
1162 {
1163  static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
1164  return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
1165 }
1166 
1167 inline v_int64x8 v_reverse(const v_int64x8 &a)
1168 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1169 
1170 inline v_float64x8 v_reverse(const v_float64x8 &a)
1171 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
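// Illustrative usage sketch (hypothetical buffer buf): v_reverse mirrors the lane order, so
// r[i] == v[nlanes - 1 - i] for every lane type defined above.
//   v_uint32x16 v = v512_load(buf);
//   v_uint32x16 r = v_reverse(v);   // r[0] == v[15], ..., r[15] == v[0]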
1172 
1174 
1176 #define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b
1177 #define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \
1178  inline sctype v_reduce_##func(const _Tpvec& a) \
1179  { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1180  sctype CV_DECL_ALIGNED(64) idx[2]; \
1181  _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \
1182  return scop(idx[0], idx[1]); }
1183 OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
1184 OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
1185 OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1186 OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, min, v_int64x8, min_epi64, min)
1187 OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, max, v_int64x8, max_epi64, max)
1188 OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, sum, v_int64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1189 
1190 #define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \
1191  inline double v_reduce_##func(const v_float64x8& a) \
1192  { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1193  double CV_DECL_ALIGNED(64) idx[2]; \
1194  _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \
1195  return scop(idx[0], idx[1]); }
1196 OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
1197 OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
1198 OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
1199 
1200 #define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \
1201  inline sctype v_reduce_##func(const _Tpvec& a) \
1202  { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1203  __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1204  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1205  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1206  return (sctype)_mm_cvtsi128_si32(quarter); }
1207 OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
1208 OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
1209 OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, min, v_int32x16, min_epi32)
1210 OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, max, v_int32x16, max_epi32)
1211 
1212 #define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \
1213  inline float v_reduce_##func(const v_float32x16& a) \
1214  { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1215  __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \
1216  quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \
1217  quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \
1218  return _mm_cvtss_f32(quarter); }
1219 OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
1220 OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)
1221 
1222 inline float v_reduce_sum(const v_float32x16& a)
1223 {
1224  __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
1225  __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1));
1226  quarter = _mm_hadd_ps(quarter, quarter);
1227  return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter));
1228 }
1229 inline int v_reduce_sum(const v_int32x16& a)
1230 {
1231  __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
1232  __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1233  quarter = _mm_hadd_epi32(quarter, quarter);
1234  return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
1235 }
1236 inline uint v_reduce_sum(const v_uint32x16& a)
1237 { return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); }
1238 
1239 #define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \
1240  inline sctype v_reduce_##func(const _Tpvec& a) \
1241  { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1242  __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1243  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1244  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1245  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1246  return (sctype)_mm_cvtsi128_si32(quarter); }
1247 OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
1248 OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
1249 OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16)
1250 OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16)
1251 
1252 inline int v_reduce_sum(const v_int16x32& a)
1253 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1254 inline uint v_reduce_sum(const v_uint16x32& a)
1255 { return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
1256 
1257 #define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \
1258  inline sctype v_reduce_##func(const _Tpvec& a) \
1259  { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \
1260  __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
1261  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \
1262  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \
1263  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \
1264  quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \
1265  return (sctype)_mm_cvtsi128_si32(quarter); }
1266 OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
1267 OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
1268 OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8)
1269 OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8)
1270 
1271 #define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \
1272  inline sctype v_reduce_sum(const _Tpvec& a) \
1273  { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \
1274  _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \
1275  a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \
1276  __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \
1277  __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \
1278  a4 = _mm_hadd_epi32(a4, a4); \
1279  return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); }
1280 OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
1281 OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8)
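// Illustrative usage sketch for the reductions above (hypothetical float buffer buf):
//   v_float32x16 v = v512_load(buf);
//   float s  = v_reduce_sum(v);   // sum of all 16 lanes
//   float mn = v_reduce_min(v);   // smallest lane
//   float mx = v_reduce_max(v);   // largest lane
// The 8- and 16-bit sums widen internally, so e.g. v_reduce_sum of a v_uint8x64 returns a uint.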
1282 
1283 inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,
1284  const v_float32x16& c, const v_float32x16& d)
1285 {
1286  __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
1287  __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
1288  __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
1289  __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
1290  return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh)));
1291 }
1292 
1293 inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)
1294 {
1295  __m512i val = _mm512_sad_epu8(a.val, b.val);
1296  __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
1297  __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1298  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1299 }
1300 inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
1301 {
1302  __m512i val = _mm512_set1_epi8(-128);
1303  val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val));
1304  __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
1305  __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
1306  return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
1307 }
1308 inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
1309 { return v_reduce_sum(v_add_wrap(a - b, b - a)); }
1310 inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
1311 { return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
1312 inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
1313 { return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
1314 inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
1315 { return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
1316 inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
1317 { return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
1318 inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
1319 { return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }
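// Note: for signed 8-bit inputs the implementation above adds 128 to both operands first; this
// maps schar [-128,127] onto uchar [0,255] without changing |a - b|, so the unsigned
// _mm512_sad_epu8 instruction can be reused. Illustrative usage (hypothetical buffers src1/src2):
//   v_uint8x64 a = v512_load(src1), b = v512_load(src2);
//   unsigned sad = v_reduce_sad(a, b);   // sum of |a[i] - b[i]| over all 64 lanes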
1320 
1322 inline v_uint8x64 v_popcount(const v_int8x64& a)
1323 {
1324 #if CV_AVX_512BITALG
1325  return v_uint8x64(_mm512_popcnt_epi8(a.val));
1326 #elif CV_AVX_512VBMI
1327  __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
1328  5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
1329  5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
1330  4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
1331  __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
1332  6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
1333  6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
1334  5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
1335  return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));
1336 #else
1337  __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);
1338  __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);
1339 
1340  return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512( a.val, _popcnt_mask)),
1341  _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));
1342 #endif
1343 }
1344 inline v_uint16x32 v_popcount(const v_int16x32& a)
1345 {
1346 #if CV_AVX_512BITALG
1347  return v_uint16x32(_mm512_popcnt_epi16(a.val));
1348 #elif CV_AVX_512VPOPCNTDQ
1349  __m512i zero = _mm512_setzero_si512();
1350  return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)),
1351  _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
1352 #else
1353  v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
1354  p += v_rotate_right<1>(p);
1355  return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
1356 #endif
1357 }
1358 inline v_uint32x16 v_popcount(const v_int32x16& a)
1359 {
1360 #if CV_AVX_512VPOPCNTDQ
1361  return v_uint32x16(_mm512_popcnt_epi32(a.val));
1362 #else
1363  v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
1364  p += v_rotate_right<1>(p);
1365  p += v_rotate_right<2>(p);
1366  return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
1367 #endif
1368 }
1369 inline v_uint64x8 v_popcount(const v_int64x8& a)
1370 {
1371 #if CV_AVX_512VPOPCNTDQ
1372  return v_uint64x8(_mm512_popcnt_epi64(a.val));
1373 #else
1374  return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512()));
1375 #endif
1376 }
1377 
1378 
1379 inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8 (a)); }
1380 inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); }
1381 inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); }
1382 inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); }
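// Illustrative usage sketch (hypothetical buffer mask_bits): v_popcount returns, per lane, the
// number of set bits; the result is always the unsigned vector of the same lane width.
//   v_uint32x16 m = v512_load(mask_bits);
//   v_uint32x16 c = v_popcount(m);   // c[i] = number of 1-bits in m[i], in the range 0..32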
1383 
1384 
1386 
1388 #if CV_FMA3
1389 #define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
1390  inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1391  { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \
1392  inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1393  { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); }
1394 #else
1395 #define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \
1396  inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1397  { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); } \
1398  inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1399  { return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); }
1400 #endif
1401 
1402 #define OPENCV_HAL_IMPL_AVX512_MISC(_Tpvec, suffix) \
1403  inline _Tpvec v_sqrt(const _Tpvec& x) \
1404  { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \
1405  inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1406  { return v_fma(a, a, b * b); } \
1407  inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1408  { return v_sqrt(v_fma(a, a, b * b)); }
1409 
1410 OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
1411 OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd)
1412 OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)
1413 OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd)
1414 
1415 inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
1416 { return a * b + c; }
1417 inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
1418 { return v_fma(a, b, c); }
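// Illustrative usage sketch (hypothetical buffers x/y): v_fma and v_muladd both compute a*b + c
// (fused when CV_FMA3 is available), and v_magnitude/v_sqr_magnitude build on them:
//   v_float32x16 a = v512_load(x), b = v512_load(y);
//   v_float32x16 hyp  = v_magnitude(a, b);    // sqrt(a*a + b*b), per lane
//   v_float32x16 sqr  = v_fma(a, a, b * b);   // the same value before the square root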
1419 
1420 inline v_float32x16 v_invsqrt(const v_float32x16& x)
1421 {
1422 #if CV_AVX_512ER
1423  return v_float32x16(_mm512_rsqrt28_ps(x.val));
1424 #else
1425  v_float32x16 half = x * v512_setall_f32(0.5);
1426  v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val));
1427  t *= v512_setall_f32(1.5) - ((t * t) * half);
1428  return t;
1429 #endif
1430 }
1431 
1432 inline v_float64x8 v_invsqrt(const v_float64x8& x)
1433 {
1434 #if CV_AVX_512ER
1435  return v_float64x8(_mm512_rsqrt28_pd(x.val));
1436 #else
1437  return v512_setall_f64(1.) / v_sqrt(x);
1438 // v_float64x8 half = x * v512_setall_f64(0.5);
1439 // v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
1440 // t *= v512_setall_f64(1.5) - ((t * t) * half);
1441 // t *= v512_setall_f64(1.5) - ((t * t) * half);
1442 // return t;
1443 #endif
1444 }
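// Note on the non-AVX512ER float path above: _mm512_rsqrt14_ps provides roughly 14 bits of
// precision, and one Newton-Raphson step t' = t * (1.5 - 0.5 * x * t^2) roughly doubles that,
// which is sufficient for float32. The double-precision fallback simply computes 1 / sqrt(x).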
1445 
1447 #define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \
1448  inline _Tpuvec v_abs(const _Tpvec& x) \
1449  { return _Tpuvec(_mm512_abs_##suffix(x.val)); }
1450 
1451 OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8)
1452 OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16)
1453 OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32)
1454 OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64)
1455 
1456 inline v_float32x16 v_abs(const v_float32x16& x)
1457 {
1458 #ifdef _mm512_abs_pd
1459  return v_float32x16(_mm512_abs_ps(x.val));
1460 #else
1461  return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),
1462  _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
1463  0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
1464 #endif
1465 }
1466 
1467 inline v_float64x8 v_abs(const v_float64x8& x)
1468 {
1469 #ifdef _mm512_abs_pd
1470  #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
1471  // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476
1472  return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
1473  #else
1474  return v_float64x8(_mm512_abs_pd(x.val));
1475  #endif
1476 #else
1477  return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),
1478  _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
1479  0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
1480 #endif
1481 }
1482 
1484 inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
1485 { return v_add_wrap(a - b, b - a); }
1486 inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
1487 { return v_add_wrap(a - b, b - a); }
1488 inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
1489 { return v_max(a, b) - v_min(a, b); }
1490 
1491 inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
1492 {
1493  v_int8x64 d = v_sub_wrap(a, b);
1494  v_int8x64 m = a < b;
1495  return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1496 }
1497 
1498 inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
1499 { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1500 
1501 inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
1502 {
1503  v_int32x16 d = a - b;
1504  v_int32x16 m = a < b;
1505  return v_reinterpret_as_u32((d ^ m) - m);
1506 }
1507 
1508 inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
1509 { return v_abs(a - b); }
1510 
1511 inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
1512 { return v_abs(a - b); }
1513 
1515 inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
1516 {
1517  v_int8x64 d = a - b;
1518  v_int8x64 m = a < b;
1519  return (d ^ m) - m;
1520 }
1521 inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
1522 { return v_max(a, b) - v_min(a, b); }
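// Note on the signed v_absdiff/v_absdiffs above: with m = (a < b) being all-ones exactly where
// a < b, the expression (d ^ m) - m conditionally negates the difference d (two's complement:
// -d == ~d + 1), yielding |a - b| without branches. Illustrative usage (hypothetical buffers):
//   v_int16x32 a = v512_load(p), b = v512_load(q);
//   v_uint16x32 ad = v_absdiff(a, b);    // widens to unsigned, full-range result
//   v_int16x32  as = v_absdiffs(a, b);   // stays signed; saturates instead of wrapping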
1523 
1525 
1527 inline v_int32x16 v_round(const v_float32x16& a)
1528 { return v_int32x16(_mm512_cvtps_epi32(a.val)); }
1529 
1530 inline v_int32x16 v_round(const v_float64x8& a)
1531 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); }
1532 
1533 inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)
1534 { return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), _mm512_cvtpd_epi32(b.val))); }
1535 
1536 inline v_int32x16 v_trunc(const v_float32x16& a)
1537 { return v_int32x16(_mm512_cvttps_epi32(a.val)); }
1538 
1539 inline v_int32x16 v_trunc(const v_float64x8& a)
1540 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); }
1541 
1542 #if CVT_ROUND_MODES_IMPLEMENTED
1543 inline v_int32x16 v_floor(const v_float32x16& a)
1544 { return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); }
1545 
1546 inline v_int32x16 v_floor(const v_float64x8& a)
1547 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); }
1548 
1549 inline v_int32x16 v_ceil(const v_float32x16& a)
1550 { return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }
1551 
1552 inline v_int32x16 v_ceil(const v_float64x8& a)
1553 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); }
1554 #else
1555 inline v_int32x16 v_floor(const v_float32x16& a)
1556 { return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); }
1557 
1558 inline v_int32x16 v_floor(const v_float64x8& a)
1559 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); }
1560 
1561 inline v_int32x16 v_ceil(const v_float32x16& a)
1562 { return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); }
1563 
1564 inline v_int32x16 v_ceil(const v_float64x8& a)
1565 { return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); }
1566 #endif
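// Note on the fallback path above: the _mm512_roundscale_* immediate selects the rounding mode
// (1 = toward -inf, 2 = toward +inf), so v_floor/v_ceil round in floating point first and the
// integer conversion then sees an already-integral value. Illustrative usage (hypothetical buf):
//   v_float32x16 v = v512_load(buf);
//   v_int32x16 r = v_round(v);   // nearest (default rounding mode)
//   v_int32x16 f = v_floor(v);   // toward -inf
//   v_int32x16 c = v_ceil(v);    // toward +inf
//   v_int32x16 t = v_trunc(v);   // toward zero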
1567 
1569 inline v_float32x16 v_cvt_f32(const v_int32x16& a)
1570 { return v_float32x16(_mm512_cvtepi32_ps(a.val)); }
1571 
1572 inline v_float32x16 v_cvt_f32(const v_float64x8& a)
1573 { return v_float32x16(_mm512_cvtpd_pslo(a.val)); }
1574 
1575 inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b)
1576 { return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }
1577 
1578 inline v_float64x8 v_cvt_f64(const v_int32x16& a)
1579 { return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }
1580 
1581 inline v_float64x8 v_cvt_f64_high(const v_int32x16& a)
1582 { return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }
1583 
1584 inline v_float64x8 v_cvt_f64(const v_float32x16& a)
1585 { return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }
1586 
1587 inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
1588 { return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }
1589 
1590 // Technique from Mysticial and wim: https://stackoverflow.com/q/41144668
1591 inline v_float64x8 v_cvt_f64(const v_int64x8& v)
1592 {
1593 #if CV_AVX_512DQ
1594  return v_float64x8(_mm512_cvtepi64_pd(v.val));
1595 #else
1596  // constants encoded as floating-point
1597  __m512i magic_i_lo = _mm512_set1_epi64(0x4330000000000000); // 2^52
1598  __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
1599  __m512i magic_i_all = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
1600  __m512d magic_d_all = _mm512_castsi512_pd(magic_i_all);
1601 
1602  // Blend the 32 least significant bits of each element of v with magic_i_lo
1603  __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
1604  // Extract the 32 most significant bits of v
1605  __m512i v_hi = _mm512_srli_epi64(v.val, 32);
1606  // Flip the msb of v_hi and blend with 0x45300000
1607  v_hi = _mm512_xor_si512(v_hi, magic_i_hi32);
1608  // Compute in double precision
1609  __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
1610  // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
1611  __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
1612  return v_float64x8(result);
1613 #endif
1614 }
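// Note on the fallback path above (assumes round-to-nearest): the two 32-bit halves are embedded
// into the mantissas of large "magic" doubles (2^52 for the low half, 2^84 + 2^63 for the
// sign-flipped high half), so subtracting magic_d_all and then adding the low part reconstructs
// hi*2^32 + lo with at most one rounding, in the final addition.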
1615 
1617 
1618 inline v_int8x64 v512_lut(const schar* tab, const int* idx)
1619 {
1620  __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1));
1621  __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
1622  __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
1623  __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
1624  return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3));
1625 }
1626 inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)
1627 {
1628  __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1));
1629  __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
1630  return v_int8x64(_v512_combine(p0, p1));
1631 }
1632 inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)
1633 {
1634  return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
1635 }
1636 inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); }
1637 inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); }
1638 inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); }
1639 
1640 inline v_int16x32 v512_lut(const short* tab, const int* idx)
1641 {
1642  __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 2));
1643  __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
1644  return v_int16x32(_v512_combine(p0, p1));
1645 }
1646 inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)
1647 {
1648  return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
1649 }
1650 inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)
1651 {
1652 #if defined(__GNUC__)
1653  return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
1654 #else
1655  return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
1656 #endif
1657 }
1658 inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); }
1659 inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); }
1660 inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); }
1661 
1662 inline v_int32x16 v512_lut(const int* tab, const int* idx)
1663 {
1664  return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
1665 }
1666 inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)
1667 {
1668 #if defined(__GNUC__)
1669  return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
1670 #else
1671  return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
1672 #endif
1673 }
1674 inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)
1675 {
1676  return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
1677  _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
1678  _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
1679  _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
1680  _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
1681 }
1682 inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); }
1683 inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); }
1684 inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); }
1685 
1686 inline v_int64x8 v512_lut(const int64* tab, const int* idx)
1687 {
1688 #if defined(__GNUC__)
1689  return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
1690 #else
1691  return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8));
1692 #endif
1693 }
1694 inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)
1695 {
1696  return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
1697  _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
1698  _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
1699  _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
1700  _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
1701 }
1702 inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); }
1703 inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); }
1704 
1705 inline v_float32x16 v512_lut(const float* tab, const int* idx)
1706 {
1707  return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
1708 }
1709 inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); }
1710 inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); }
1711 
1712 inline v_float64x8 v512_lut(const double* tab, const int* idx)
1713 {
1714  return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
1715 }
1716 inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)
1717 {
1718  return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
1719  _mm_loadu_pd(tab + idx[0])),
1720  _mm_loadu_pd(tab + idx[1]), 1),
1721  _mm_loadu_pd(tab + idx[2]), 2),
1722  _mm_loadu_pd(tab + idx[3]), 3));
1723 }
1724 
1725 inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)
1726 {
1727  return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
1728 }
1729 
1730 inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)
1731 {
1732  return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
1733 }
1734 
1735 inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)
1736 {
1737  return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
1738 }
1739 
1740 inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)
1741 {
1742  return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
1743 }
1744 
1745 inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y)
1746 {
1747  x.val = _mm512_i32gather_ps(idxvec.val, tab, 4);
1748  y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4);
1749 }
1750 
1751 inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y)
1752 {
1753  x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8);
1754  y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8);
1755 }
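// Illustrative usage sketch (hypothetical table tab and index array idx):
//   int idx[16];                                  // one table index per output lane (caller-filled)
//   v_float32x16 gathered = v512_lut(tab, idx);   // gathered[i] = tab[idx[i]]
// v512_lut_pairs / v512_lut_quads read 2 / 4 consecutive elements per index instead, and so
// consume only 8 / 4 indices for the same vector width.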
1756 
1757 inline v_int8x64 v_interleave_pairs(const v_int8x64& vec)
1758 {
1759  return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200)));
1760 }
1761 inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
1762 inline v_int8x64 v_interleave_quads(const v_int8x64& vec)
1763 {
1764  return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400)));
1765 }
1766 inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
1767 
1768 inline v_int16x32 v_interleave_pairs(const v_int16x32& vec)
1769 {
1770  return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100)));
1771 }
1772 inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
1773 inline v_int16x32 v_interleave_quads(const v_int16x32& vec)
1774 {
1775  return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100)));
1776 }
1777 inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
1778 
1779 inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
1780 {
1781  return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD));
1782 }
1783 inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1784 inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
1785 
1786 inline v_int8x64 v_pack_triplets(const v_int8x64& vec)
1787 {
1788  return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1789  0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000),
1790  _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100))));
1791 }
1792 inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
1793 
1794 inline v_int16x32 v_pack_triplets(const v_int16x32& vec)
1795 {
1796  return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015,
1797  0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val));
1798 }
1799 inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
1800 
1801 inline v_int32x16 v_pack_triplets(const v_int32x16& vec)
1802 {
1803  return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1804  0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
1805 }
1806 inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
1807 inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
1808 {
1809  return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
1810  0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
1811 }
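// Note: v_pack_triplets compacts each group of four lanes (x, y, z, unused) into contiguous
// (x, y, z) triplets at the front of the vector; the trailing lanes hold unspecified padding
// (copies of the last input elements) and are not meant to be stored. This is typically used
// when converting 4-channel intermediate data into packed 3-channel output.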
1812 
1814 
1816 
1817 // 16 >> 32
1818 inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
1819 { return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
1820 inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
1821 { return v_dotprod(a, b) + c; }
1822 
1823 // 32 >> 64
1824 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
1825 {
1826  __m512i even = _mm512_mul_epi32(a.val, b.val);
1827  __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
1828  return v_int64x8(_mm512_add_epi64(even, odd));
1829 }
1830 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
1831 { return v_dotprod(a, b) + c; }
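// Illustrative usage sketch (hypothetical 16-bit buffers p and q): v_dotprod multiplies
// corresponding lanes and adds adjacent pairs into the next-wider signed type; the
// three-argument form accumulates into an existing sum.
//   v_int16x32 a = v512_load(p), b = v512_load(q);
//   v_int32x16 sum = v512_setall_s32(0);
//   v_int32x16 d   = v_dotprod(a, b);        // d[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]
//   sum            = v_dotprod(a, b, sum);   // same products added to the accumulator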
1832 
1833 // 8 >> 32
1834 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
1835 {
1836  __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
1837  __m512i odd_a = _mm512_srli_epi16(a.val, 8);
1838 
1839  __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
1840  __m512i odd_b = _mm512_srli_epi16(b.val, 8);
1841 
1842  __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
1843  __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
1844  return v_uint32x16(_mm512_add_epi32(prod0, prod1));
1845 }
1846 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
1847 { return v_dotprod_expand(a, b) + c; }
1848 
1849 inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
1850 {
1851  __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
1852  __m512i odd_a = _mm512_srai_epi16(a.val, 8);
1853 
1854  __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
1855  __m512i odd_b = _mm512_srai_epi16(b.val, 8);
1856 
1857  __m512i prod0 = _mm512_madd_epi16(even_a, even_b);
1858  __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b);
1859  return v_int32x16(_mm512_add_epi32(prod0, prod1));
1860 }
1861 inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
1862 { return v_dotprod_expand(a, b) + c; }
1863 
1864 // 16 >> 64
1865 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
1866 {
1867  __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
1868  __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
1869  __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
1870  __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
1871 
1872  __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
1873  __m512i p13 = _mm512_srli_epi64(mul0, 32);
1874  __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
1875  __m512i p57 = _mm512_srli_epi64(mul1, 32);
1876 
1877  __m512i p15_ = _mm512_add_epi64(p02, p13);
1878  __m512i p9d_ = _mm512_add_epi64(p46, p57);
1879 
1880  return v_uint64x8(_mm512_add_epi64(
1881  _mm512_unpacklo_epi64(p15_, p9d_),
1882  _mm512_unpackhi_epi64(p15_, p9d_)
1883  ));
1884 }
1885 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
1886 { return v_dotprod_expand(a, b) + c; }
1887 
1888 inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
1889 {
1890  __m512i prod = _mm512_madd_epi16(a.val, b.val);
1891  __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
1892  __m512i odd = _mm512_srai_epi64(prod, 32);
1893  return v_int64x8(_mm512_add_epi64(even, odd));
1894 }
1895 inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
1896 { return v_dotprod_expand(a, b) + c; }
1897 
1898 // 32 >> 64f
1899 inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
1900 { return v_cvt_f64(v_dotprod(a, b)); }
1901 inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
1902 { return v_dotprod_expand(a, b) + c; }
1903 
1905 
1906 // 16 >> 32
1907 inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
1908 { return v_dotprod(a, b); }
1909 inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
1910 { return v_dotprod(a, b, c); }
1911 
1912 // 32 >> 64
1913 inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
1914 { return v_dotprod(a, b); }
1915 inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
1916 { return v_dotprod(a, b, c); }
1917 
1918 // 8 >> 32
1919 inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b)
1920 { return v_dotprod_expand(a, b); }
1921 inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
1922 { return v_dotprod_expand(a, b, c); }
1923 
1924 inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b)
1925 { return v_dotprod_expand(a, b); }
1926 inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
1927 { return v_dotprod_expand(a, b, c); }
1928 
1929 // 16 >> 64
1930 inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
1931 {
1932  __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
1933  __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
1934  __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi);
1935  __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi);
1936 
1937  __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
1938  __m512i p13 = _mm512_srli_epi64(mul0, 32);
1939  __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
1940  __m512i p57 = _mm512_srli_epi64(mul1, 32);
1941 
1942  __m512i p15_ = _mm512_add_epi64(p02, p13);
1943  __m512i p9d_ = _mm512_add_epi64(p46, p57);
1944  return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
1945 }
1946 inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
1947 { return v_dotprod_expand_fast(a, b) + c; }
1948 
1949 inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
1950 { return v_dotprod_expand(a, b); }
1951 inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
1952 { return v_dotprod_expand(a, b, c); }
1953 
1954 // 32 >> 64f
1955 inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
1956 { return v_dotprod_expand(a, b); }
1957 inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
1958 { return v_dotprod_expand(a, b) + c; }
1959 
1960 
1961 #define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
1962  v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1963 
1964 inline v_float32x16 v_matmul(const v_float32x16& v,
1965  const v_float32x16& m0, const v_float32x16& m1,
1966  const v_float32x16& m2, const v_float32x16& m3)
1967 {
1968  v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
1969  v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
1970  v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
1971  v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
1972  return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1973 }
1974 
1975 inline v_float32x16 v_matmuladd(const v_float32x16& v,
1976  const v_float32x16& m0, const v_float32x16& m1,
1977  const v_float32x16& m2, const v_float32x16& a)
1978 {
1979  v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
1980  v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
1981  v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
1982  return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1983 }
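// Note: v_matmul treats the 16 float lanes as four packed 4-component vectors and m0..m3 as the
// four matrix columns replicated per 128-bit block, so one call performs four independent 4x4
// matrix-vector products; v_matmuladd does the same with a constant vector a added instead of
// the fourth column term.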
1984 
1985 #define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1986  inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1987  const _Tpvec& a2, const _Tpvec& a3, \
1988  _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1989  { \
1990  __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \
1991  __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \
1992  __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \
1993  __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \
1994  b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \
1995  b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \
1996  b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \
1997  b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \
1998  }
1999 
2000 OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2001 OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
2002 OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps)
2003 
2004 
2006 /* Expand */
2007 #define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
2008  inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
2009  { \
2010  b0.val = intrin(_v512_extract_low(a.val)); \
2011  b1.val = intrin(_v512_extract_high(a.val)); \
2012  } \
2013  inline _Tpwvec v_expand_low(const _Tpvec& a) \
2014  { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \
2015  inline _Tpwvec v_expand_high(const _Tpvec& a) \
2016  { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \
2017  inline _Tpwvec v512_load_expand(const _Tp* ptr) \
2018  { \
2019  __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \
2020  return _Tpwvec(intrin(a)); \
2021  }
2022 
2023 OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16)
2024 OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16)
2025 OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32)
2026 OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32)
2027 OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64)
2028 OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64)
2029 
2030 #define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
2031  inline _Tpvec v512_load_expand_q(const _Tp* ptr) \
2032  { \
2033  __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
2034  return _Tpvec(intrin(a)); \
2035  }
2036 
2037 OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32)
2038 OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32)
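// Illustrative usage sketch (hypothetical uchar buffer src): the expand family widens each lane
// to the next-larger type.
//   v_uint8x64 a = v512_load(src);
//   v_uint16x32 lo, hi;
//   v_expand(a, lo, hi);                       // lo = first 32 lanes widened, hi = last 32
//   v_uint16x32 w = v512_load_expand(src);     // load 32 bytes and widen in one step
//   v_uint32x16 q = v512_load_expand_q(src);   // load 16 bytes and widen straight to 32-bit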
2039 
2040 /* pack */
2041 // 16
2042 inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
2043 { return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
2044 
2045 inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
2046 {
2047  const __m512i t = _mm512_set1_epi16(255);
2048  return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t))));
2049 }
2050 
2051 inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
2052 {
2053  return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
2054 }
2055 
2056 inline void v_pack_store(schar* ptr, const v_int16x32& a)
2057 { v_store_low(ptr, v_pack(a, a)); }
2058 
2059 inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
2060 {
2061  const __m512i m = _mm512_set1_epi16(255);
2062  _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
2063 }
2064 
2065 inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
2066 { v_store_low(ptr, v_pack_u(a, a)); }
2067 
2068 template<int n> inline
2069 v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
2070 {
2071  // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
2072  v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
2073  return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
2074  v_reinterpret_as_s16((b + delta) >> n));
2075 }
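// Illustrative usage sketch: v_rshr_pack<n> adds the rounding constant 2^(n-1), shifts right by n
// and then packs with saturation, i.e. a fixed-point "round half up" narrowing
// (hypothetical 16-bit buffers p and q):
//   v_uint16x32 a = v512_load(p), b = v512_load(q);
//   v_uint8x64 out = v_rshr_pack<4>(a, b);   // per lane: saturate_cast<uchar>((x + 8) >> 4)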
2076 
2077 template<int n> inline
2078 void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
2079 {
2080  v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
2081  v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
2082 }
2083 
2084 template<int n> inline
2085 v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
2086 {
2087  v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2088  return v_pack_u((a + delta) >> n, (b + delta) >> n);
2089 }
2090 
2091 template<int n> inline
2092 void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
2093 {
2094  v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2095  v_pack_u_store(ptr, (a + delta) >> n);
2096 }
2097 
2098 template<int n> inline
2099 v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
2100 {
2101  v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2102  return v_pack((a + delta) >> n, (b + delta) >> n);
2103 }
2104 
2105 template<int n> inline
2106 void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
2107 {
2108  v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
2109  v_pack_store(ptr, (a + delta) >> n);
2110 }
2111 
2112 // 32
2113 inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
2114 { return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
2115 
2116 inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
2117 {
2118  const __m512i m = _mm512_set1_epi32(65535);
2119  return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
2120 }
2121 
2122 inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
2123 { return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
2124 
2125 inline void v_pack_store(short* ptr, const v_int32x16& a)
2126 { v_store_low(ptr, v_pack(a, a)); }
2127 
2128 inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
2129 {
2130  const __m512i m = _mm512_set1_epi32(65535);
2131  _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
2132 }
2133 
2134 inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
2135 { v_store_low(ptr, v_pack_u(a, a)); }
2136 
2137 
2138 template<int n> inline
2139 v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
2140 {
2141  v_uint32x16 delta = v512_setall_u32(1 << (n-1));
2142  return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
2143  v_reinterpret_as_s32((b + delta) >> n));
2144 }
2145 
2146 template<int n> inline
2147 void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
2148 {
2149  v_uint32x16 delta = v512_setall_u32(1 << (n-1));
2150  v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
2151 }
2152 
2153 template<int n> inline
2154 v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
2155 {
2156  v_int32x16 delta = v512_setall_s32(1 << (n-1));
2157  return v_pack_u((a + delta) >> n, (b + delta) >> n);
2158 }
2159 
2160 template<int n> inline
2161 void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
2162 {
2163  v_int32x16 delta = v512_setall_s32(1 << (n-1));
2164  v_pack_u_store(ptr, (a + delta) >> n);
2165 }
2166 
2167 template<int n> inline
2168 v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
2169 {
2170  v_int32x16 delta = v512_setall_s32(1 << (n-1));
2171  return v_pack((a + delta) >> n, (b + delta) >> n);
2172 }
2173 
2174 template<int n> inline
2175 void v_rshr_pack_store(short* ptr, const v_int32x16& a)
2176 {
2177  v_int32x16 delta = v512_setall_s32(1 << (n-1));
2178  v_pack_store(ptr, (a + delta) >> n);
2179 }
2180 
2181 // 64
2182 // Non-saturating pack
2183 inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
2184 { return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }
2185 
2186 inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
2187 { return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
2188 
2189 inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
2190 { _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }
2191 
2192 inline void v_pack_store(int* ptr, const v_int64x8& b)
2193 { v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
2194 
2195 template<int n> inline
2196 v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
2197 {
2198  v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
2199  return v_pack((a + delta) >> n, (b + delta) >> n);
2200 }
2201 
2202 template<int n> inline
2203 void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
2204 {
2205  v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
2206  v_pack_store(ptr, (a + delta) >> n);
2207 }
2208 
2209 template<int n> inline
2210 v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
2211 {
2212  v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
2213  return v_pack((a + delta) >> n, (b + delta) >> n);
2214 }
2215 
2216 template<int n> inline
2217 void v_rshr_pack_store(int* ptr, const v_int64x8& a)
2218 {
2219  v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
2220  v_pack_store(ptr, (a + delta) >> n);
2221 }
2222 
2223 // pack boolean
2224 inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
2225 { return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
2226 
2227 inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
2228  const v_uint32x16& c, const v_uint32x16& d)
2229 {
2230  __m512i ab = _mm512_packs_epi32(a.val, b.val);
2231  __m512i cd = _mm512_packs_epi32(c.val, d.val);
2232 
2233  return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
2234 }
2235 
2236 inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
2237  const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
2238  const v_uint64x8& g, const v_uint64x8& h)
2239 {
2240  __m512i ab = _mm512_packs_epi32(a.val, b.val);
2241  __m512i cd = _mm512_packs_epi32(c.val, d.val);
2242  __m512i ef = _mm512_packs_epi32(e.val, f.val);
2243  __m512i gh = _mm512_packs_epi32(g.val, h.val);
2244 
2245  __m512i abcd = _mm512_packs_epi32(ab, cd);
2246  __m512i efgh = _mm512_packs_epi32(ef, gh);
2247 
2248  return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
2249  27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
2250 }
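// Note: the v_pack_b overloads narrow comparison masks (all-zero / all-one lanes) of 16-, 32- or
// 64-bit width down to one byte per lane while preserving lane order, so a wide mask can be
// stored or tested as a single v_uint8x64.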
2251 
2252 /* Recombine */
2253 // implemented above, together with the load and store operations
2254 
2255 /* Extract */
2256 #define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
2257  template<int s> \
2258  inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2259  { return v_rotate_right<s>(a, b); }
2260 
2261 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
2262 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
2263 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
2264 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
2265 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
2266 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
2267 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
2268 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
2269 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
2270 OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
2271 
2272 #define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
2273 template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }
2274 
2275 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
2276 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
2277 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
2278 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
2279 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
2280 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
2281 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
2282 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
2283 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
2284 OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
2285 
2286 template<int i>
2287 inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
2288 {
2289  static const __m512i perm = _mm512_set1_epi32((char)i);
2290  return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
2291 }
2292 
2293 template<int i>
2294 inline v_int32x16 v_broadcast_element(const v_int32x16 &a)
2295 { return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
2296 
2297 template<int i>
2298 inline v_float32x16 v_broadcast_element(const v_float32x16 &a)
2299 { return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
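// Illustrative usage sketch (hypothetical buffer buf):
//   v_float32x16 v = v512_load(buf);
//   float x5 = v_extract_n<5>(v);                   // scalar copy of lane 5
//   v_float32x16 b5 = v_broadcast_element<5>(v);    // every lane set to lane 5
//   v_float32x16 e  = v_extract<3>(v, v);           // lane rotation, see v_rotate_right above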
2300 
2301 
2303 
2304 inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
2305 {
2306  __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2307  __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2308 #if CV_AVX_512VBMI
2309  __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
2310  94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
2311  62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2312  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2313  __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
2314  95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
2315  63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2316  31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2317  a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
2318  b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
2319 #else
2320  __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
2321  __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
2322  __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
2323  __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2324  __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2325  a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
2326  b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
2327 #endif
2328 }
2329 
2330 inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
2331 {
2332  __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2333  __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2334  __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2335  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2336  __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2337  31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2338  a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
2339  b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
2340 }
2341 
2342 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
2343 {
2344  __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2345  __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2346  __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2347  __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2348  a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1));
2349  b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1));
2350 }
2351 
2352 inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b )
2353 {
2354  __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
2355  __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2356  __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2357  __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2358  a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1));
2359  b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
2360 }
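
// Usage sketch (illustrative only; assumes the v_store helper defined earlier in this
// header): the two-channel overloads split an interleaved pair stream (e.g. CbCr or
// complex numbers) into separate planes.
static inline void example_deinterleave2(const ushort* interleaved, ushort* plane0, ushort* plane1)
{
    v_uint16x32 a, b;
    v_load_deinterleave(interleaved, a, b);  // reads 64 ushorts: even indices -> a, odd -> b
    v_store(plane0, a);
    v_store(plane1, b);
}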
2361 
2362 inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
2363 {
2364  __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2365  __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2366  __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
2367 
2368 #if CV_AVX_512VBMI2
2369  __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81,
2370  78, 75, 72, 69, 66, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33,
2371  30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 62, 59, 56, 53, 50,
2372  47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2);
2373  __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1);
2374  __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2);
2375  __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
2376  _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
2377  77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97,
2378  94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49,
2379  46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2);
2380  a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
2381  b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
2382  c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
2383 #elif CV_AVX_512VBMI
2384  __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
2385  __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
2386  __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
2387  a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
2388  77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48,
2389  46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24,
2390  23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2));
2391  b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40,
2392  39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17,
2393  15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114,
2394  111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0));
2395  c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18,
2396  15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98,
2397  95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50,
2398  47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1));
2399 #else
2400  __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
2401  45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2402  __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
2403  __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
2404  __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
2405 
2406  __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2);
2407  __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
2408  14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
2409  __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
2410  a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
2411  c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
2412  b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
2413 #endif
2414 }
2415 
2416 inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
2417 {
2418  __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2419  __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2420  __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2421 
2422  __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
2423  45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2424  __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
2425  __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
2426  __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
2427 
2428  a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
2429  b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
2430  14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
2431  c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
2432 }
2433 
2434 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
2435 {
2436  __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2437  __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2438  __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2439 
2440  __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
2441  __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1);
2442  __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
2443  __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);
2444 
2445  a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
2446  b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
2447  c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
2448 }
2449 
2450 inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
2451 {
2452  __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
2453  __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2454  __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2455 
2456  __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0);
2457  __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1);
2458  __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
2459  __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);
2460 
2461  a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
2462  c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
2463  b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
2464 }
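
// Usage sketch (illustrative only; assumes the v_store helper defined earlier in this
// header): the three-channel overloads are the building block of packed BGR -> planar
// conversions.
static inline void example_deinterleave_bgr(const uchar* bgr, uchar* bpl, uchar* gpl, uchar* rpl)
{
    v_uint8x64 b, g, r;
    v_load_deinterleave(bgr, b, g, r);   // consumes 3 x 64 = 192 interleaved bytes
    v_store(bpl, b); v_store(gpl, g); v_store(rpl, r);
}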
2465 
2466 inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
2467 {
2468  __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2469  __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2470  __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
2471  __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192));
2472 
2473 #if CV_AVX_512VBMI
2474  __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
2475  94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
2476  62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2477  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2478  __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
2479  95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
2480  63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2481  31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2482 
2483  __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1);
2484  __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1);
2485  __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
2486  __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);
2487 
2488  a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
2489  c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
2490  b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
2491  d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
2492 #else
2493  __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
2494  __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
2495  __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask);
2496  __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask);
2497  __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask);
2498 
2499  __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2500  __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2501 
2502  __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1);
2503  __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1);
2504  __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
2505  __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);
2506 
2507  a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
2508  c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
2509  b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
2510  d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
2511 #endif
2512 }
2513 
2514 inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
2515 {
2516  __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2517  __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2518  __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
2519  __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96));
2520 
2521  __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
2522  30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2523  __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
2524  31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2525 
2526  __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1);
2527  __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1);
2528  __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
2529  __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);
2530 
2531  a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
2532  c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
2533  b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
2534  d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
2535 }
2536 
2537 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
2538 {
2539  __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2540  __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2541  __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
2542  __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48));
2543 
2544  __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
2545  __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
2546 
2547  __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1);
2548  __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1);
2549  __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
2550  __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);
2551 
2552  a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
2553  c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
2554  b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
2555  d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
2556 }
2557 
2558 inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
2559 {
2560  __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
2561  __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
2562  __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
2563  __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24));
2564 
2565  __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
2566  __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
2567 
2568  __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1);
2569  __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1);
2570  __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
2571  __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);
2572 
2573  a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
2574  c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
2575  b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
2576  d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
2577 }
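
// Usage sketch (illustrative only; assumes the v_store helper defined earlier in this
// header): the four-channel overloads make it cheap to peel a single plane (here alpha)
// out of packed BGRA data.
static inline void example_extract_alpha(const uchar* bgra, uchar* alpha)
{
    v_uint8x64 b, g, r, a;
    v_load_deinterleave(bgra, b, g, r, a);  // consumes 4 x 64 = 256 interleaved bytes
    v_store(alpha, a);
}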
2578 
2580 
2581 inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
2582                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2583 {
2584  v_uint8x64 low, high;
2585  v_zip(x, y, low, high);
2586  if( mode == hal::STORE_ALIGNED_NOCACHE )
2587  {
2588  _mm512_stream_si512((__m512i*)ptr, low.val);
2589  _mm512_stream_si512((__m512i*)(ptr + 64), high.val);
2590  }
2591  else if( mode == hal::STORE_ALIGNED )
2592  {
2593  _mm512_store_si512((__m512i*)ptr, low.val);
2594  _mm512_store_si512((__m512i*)(ptr + 64), high.val);
2595  }
2596  else
2597  {
2598  _mm512_storeu_si512((__m512i*)ptr, low.val);
2599  _mm512_storeu_si512((__m512i*)(ptr + 64), high.val);
2600  }
2601 }
2602 
2603 inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
2604                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2605 {
2606  v_uint16x32 low, high;
2607  v_zip(x, y, low, high);
2608  if( mode == hal::STORE_ALIGNED_NOCACHE )
2609  {
2610  _mm512_stream_si512((__m512i*)ptr, low.val);
2611  _mm512_stream_si512((__m512i*)(ptr + 32), high.val);
2612  }
2613  else if( mode == hal::STORE_ALIGNED )
2614  {
2615  _mm512_store_si512((__m512i*)ptr, low.val);
2616  _mm512_store_si512((__m512i*)(ptr + 32), high.val);
2617  }
2618  else
2619  {
2620  _mm512_storeu_si512((__m512i*)ptr, low.val);
2621  _mm512_storeu_si512((__m512i*)(ptr + 32), high.val);
2622  }
2623 }
2624 
2625 inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
2626                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2627 {
2628  v_uint32x16 low, high;
2629  v_zip(x, y, low, high);
2630  if( mode == hal::STORE_ALIGNED_NOCACHE )
2631  {
2632  _mm512_stream_si512((__m512i*)ptr, low.val);
2633  _mm512_stream_si512((__m512i*)(ptr + 16), high.val);
2634  }
2635  else if( mode == hal::STORE_ALIGNED )
2636  {
2637  _mm512_store_si512((__m512i*)ptr, low.val);
2638  _mm512_store_si512((__m512i*)(ptr + 16), high.val);
2639  }
2640  else
2641  {
2642  _mm512_storeu_si512((__m512i*)ptr, low.val);
2643  _mm512_storeu_si512((__m512i*)(ptr + 16), high.val);
2644  }
2645 }
2646 
2647 inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
2648                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2649 {
2650  v_uint64x8 low, high;
2651  v_zip(x, y, low, high);
2652  if( mode == hal::STORE_ALIGNED_NOCACHE )
2653  {
2654  _mm512_stream_si512((__m512i*)ptr, low.val);
2655  _mm512_stream_si512((__m512i*)(ptr + 8), high.val);
2656  }
2657  else if( mode == hal::STORE_ALIGNED )
2658  {
2659  _mm512_store_si512((__m512i*)ptr, low.val);
2660  _mm512_store_si512((__m512i*)(ptr + 8), high.val);
2661  }
2662  else
2663  {
2664  _mm512_storeu_si512((__m512i*)ptr, low.val);
2665  _mm512_storeu_si512((__m512i*)(ptr + 8), high.val);
2666  }
2667 }
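
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): v_store_interleave is the inverse of v_load_deinterleave; the optional
// hal::StoreMode selects unaligned, aligned or non-temporal stores.
static inline void example_interleave2(const unsigned* plane0, const unsigned* plane1, unsigned* interleaved)
{
    v_uint32x16 a = v512_load(plane0), b = v512_load(plane1);
    v_store_interleave(interleaved, a, b);                         // unaligned store by default
    // v_store_interleave(interleaved, a, b, hal::STORE_ALIGNED);  // if interleaved is 64-byte aligned
}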
2668 
2669 inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
2670                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2671 {
2672 #if CV_AVX_512VBMI
2673  __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122,
2674  79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74,
2675  10, 116, 73, 9, 115, 72, 8, 114, 71, 7, 113, 70, 6, 112, 69, 5,
2676  111, 68, 4, 110, 67, 3, 109, 66, 2, 108, 65, 1, 107, 64, 0, 106);
2677  __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16,
2678  37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32,
2679  95, 10, 31, 94, 9, 30, 93, 8, 29, 92, 7, 28, 91, 6, 27, 90,
2680  5, 26, 89, 4, 25, 88, 3, 24, 87, 2, 23, 86, 1, 22, 85, 0);
2681  __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101,
2682  122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117,
2683  53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48,
2684  90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85);
2685  __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
2686  __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
2687  __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);
2688 
2689  __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
2690  __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
2691  __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
2692 #else
2693  __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
2694  __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
2695  __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
2696  __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);
2697 
2698  __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
2699  5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
2700  __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
2701  47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
2702  __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
2703  26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
2704  __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1);
2705  __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
2706  __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);
2707 
2708  __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
2709  __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
2710  __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
2711 #endif
2712 
2713  if( mode == hal::STORE_ALIGNED_NOCACHE )
2714  {
2715  _mm512_stream_si512((__m512i*)ptr, bgr0);
2716  _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
2717  _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
2718  }
2719  else if( mode == hal::STORE_ALIGNED )
2720  {
2721  _mm512_store_si512((__m512i*)ptr, bgr0);
2722  _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
2723  _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
2724  }
2725  else
2726  {
2727  _mm512_storeu_si512((__m512i*)ptr, bgr0);
2728  _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
2729  _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
2730  }
2731 }
2732 
2733 inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
2734                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2735 {
2736  __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
2737  5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
2738  __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
2739  47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
2740  __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
2741  26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
2742  __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
2743  __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
2744  __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);
2745 
2746  __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
2747  __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
2748  __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
2749 
2750  if( mode == hal::STORE_ALIGNED_NOCACHE )
2751  {
2752  _mm512_stream_si512((__m512i*)ptr, bgr0);
2753  _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
2754  _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
2755  }
2756  else if( mode == hal::STORE_ALIGNED )
2757  {
2758  _mm512_store_si512((__m512i*)ptr, bgr0);
2759  _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
2760  _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
2761  }
2762  else
2763  {
2764  _mm512_storeu_si512((__m512i*)ptr, bgr0);
2765  _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
2766  _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
2767  }
2768 }
2769 
2770 inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
2771                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2772 {
2773  __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
2774  __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26);
2775  __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
2776  __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);
2777 
2778  __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
2779  __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
2780  __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);
2781 
2782  if( mode == hal::STORE_ALIGNED_NOCACHE )
2783  {
2784  _mm512_stream_si512((__m512i*)ptr, bgr0);
2785  _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
2786  _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
2787  }
2788  else if( mode == hal::STORE_ALIGNED )
2789  {
2790  _mm512_store_si512((__m512i*)ptr, bgr0);
2791  _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
2792  _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
2793  }
2794  else
2795  {
2796  _mm512_storeu_si512((__m512i*)ptr, bgr0);
2797  _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
2798  _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
2799  }
2800 }
2801 
2802 inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
2803                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2804 {
2805  __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10);
2806  __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5);
2807  __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
2808  __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);
2809 
2810  __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
2811  __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
2812  __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);
2813 
2814  if( mode == hal::STORE_ALIGNED_NOCACHE )
2815  {
2816  _mm512_stream_si512((__m512i*)ptr, bgr0);
2817  _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
2818  _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
2819  }
2820  else if( mode == hal::STORE_ALIGNED )
2821  {
2822  _mm512_store_si512((__m512i*)ptr, bgr0);
2823  _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
2824  _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
2825  }
2826  else
2827  {
2828  _mm512_storeu_si512((__m512i*)ptr, bgr0);
2829  _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
2830  _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
2831  }
2832 }
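
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): planar -> packed BGR, the inverse of the three-channel v_load_deinterleave above.
static inline void example_interleave_bgr(const uchar* bpl, const uchar* gpl, const uchar* rpl, uchar* bgr)
{
    v_store_interleave(bgr, v512_load(bpl), v512_load(gpl), v512_load(rpl));  // writes 192 bytes
}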
2833 
2834 inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
2835                                 const v_uint8x64& c, const v_uint8x64& d,
2836                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2837 {
2838  v_uint8x64 br01, br23, ga01, ga23;
2839  v_zip(a, c, br01, br23);
2840  v_zip(b, d, ga01, ga23);
2841  v_uint8x64 bgra0, bgra1, bgra2, bgra3;
2842  v_zip(br01, ga01, bgra0, bgra1);
2843  v_zip(br23, ga23, bgra2, bgra3);
2844 
2845  if( mode == hal::STORE_ALIGNED_NOCACHE )
2846  {
2847  _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2848  _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val);
2849  _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val);
2850  _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val);
2851  }
2852  else if( mode == hal::STORE_ALIGNED )
2853  {
2854  _mm512_store_si512((__m512i*)ptr, bgra0.val);
2855  _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val);
2856  _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val);
2857  _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val);
2858  }
2859  else
2860  {
2861  _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2862  _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val);
2863  _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val);
2864  _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val);
2865  }
2866 }
2867 
2868 inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
2869                                 const v_uint16x32& c, const v_uint16x32& d,
2870                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2871 {
2872  v_uint16x32 br01, br23, ga01, ga23;
2873  v_zip(a, c, br01, br23);
2874  v_zip(b, d, ga01, ga23);
2875  v_uint16x32 bgra0, bgra1, bgra2, bgra3;
2876  v_zip(br01, ga01, bgra0, bgra1);
2877  v_zip(br23, ga23, bgra2, bgra3);
2878 
2879  if( mode == hal::STORE_ALIGNED_NOCACHE )
2880  {
2881  _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2882  _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val);
2883  _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val);
2884  _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val);
2885  }
2886  else if( mode == hal::STORE_ALIGNED )
2887  {
2888  _mm512_store_si512((__m512i*)ptr, bgra0.val);
2889  _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val);
2890  _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val);
2891  _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val);
2892  }
2893  else
2894  {
2895  _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2896  _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val);
2897  _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val);
2898  _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val);
2899  }
2900 }
2901 
2902 inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
2903                                 const v_uint32x16& c, const v_uint32x16& d,
2904                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2905 {
2906  v_uint32x16 br01, br23, ga01, ga23;
2907  v_zip(a, c, br01, br23);
2908  v_zip(b, d, ga01, ga23);
2909  v_uint32x16 bgra0, bgra1, bgra2, bgra3;
2910  v_zip(br01, ga01, bgra0, bgra1);
2911  v_zip(br23, ga23, bgra2, bgra3);
2912 
2913  if( mode == hal::STORE_ALIGNED_NOCACHE )
2914  {
2915  _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2916  _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val);
2917  _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val);
2918  _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val);
2919  }
2920  else if( mode == hal::STORE_ALIGNED )
2921  {
2922  _mm512_store_si512((__m512i*)ptr, bgra0.val);
2923  _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val);
2924  _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val);
2925  _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val);
2926  }
2927  else
2928  {
2929  _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2930  _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val);
2931  _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val);
2932  _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val);
2933  }
2934 }
2935 
2936 inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
2937                                 const v_uint64x8& c, const v_uint64x8& d,
2938                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2939 {
2940  v_uint64x8 br01, br23, ga01, ga23;
2941  v_zip(a, c, br01, br23);
2942  v_zip(b, d, ga01, ga23);
2943  v_uint64x8 bgra0, bgra1, bgra2, bgra3;
2944  v_zip(br01, ga01, bgra0, bgra1);
2945  v_zip(br23, ga23, bgra2, bgra3);
2946 
2947  if( mode == hal::STORE_ALIGNED_NOCACHE )
2948  {
2949  _mm512_stream_si512((__m512i*)ptr, bgra0.val);
2950  _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val);
2951  _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val);
2952  _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val);
2953  }
2954  else if( mode == hal::STORE_ALIGNED )
2955  {
2956  _mm512_store_si512((__m512i*)ptr, bgra0.val);
2957  _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val);
2958  _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val);
2959  _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val);
2960  }
2961  else
2962  {
2963  _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
2964  _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val);
2965  _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val);
2966  _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val);
2967  }
2968 }
2969 
2970 #define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2971 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2972 { \
2973  _Tpvec1 a1, b1; \
2974  v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2975  a0 = v_reinterpret_as_##suffix0(a1); \
2976  b0 = v_reinterpret_as_##suffix0(b1); \
2977 } \
2978 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2979 { \
2980  _Tpvec1 a1, b1, c1; \
2981  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2982  a0 = v_reinterpret_as_##suffix0(a1); \
2983  b0 = v_reinterpret_as_##suffix0(b1); \
2984  c0 = v_reinterpret_as_##suffix0(c1); \
2985 } \
2986 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2987 { \
2988  _Tpvec1 a1, b1, c1, d1; \
2989  v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2990  a0 = v_reinterpret_as_##suffix0(a1); \
2991  b0 = v_reinterpret_as_##suffix0(b1); \
2992  c0 = v_reinterpret_as_##suffix0(c1); \
2993  d0 = v_reinterpret_as_##suffix0(d1); \
2994 } \
2995 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2996  hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2997 { \
2998  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2999  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3000  v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
3001 } \
3002 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
3003  hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3004 { \
3005  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3006  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3007  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3008  v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
3009 } \
3010 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
3011  const _Tpvec0& c0, const _Tpvec0& d0, \
3012  hal::StoreMode mode=hal::STORE_UNALIGNED ) \
3013 { \
3014  _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
3015  _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
3016  _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
3017  _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
3018  v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
3019 }
3020 
3021 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8)
3022 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16)
3023 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32)
3024 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32)
3025 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64)
3026 OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64)
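
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): through the macro above, the signed and floating-point overloads forward to the
// unsigned kernels, so float planes can be (de)interleaved directly.
static inline void example_interleave_f32(const float* xs, const float* ys, float* xy)
{
    v_float32x16 x = v512_load(xs), y = v512_load(ys);
    v_store_interleave(xy, x, y);   // dispatched to the v_uint32x16 implementation
}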
3027 
3028 
3031 inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
3032 inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3033 inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3034 inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3035 
3036 inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); }
3037 inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); }
3038 inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
3039 inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
3040 inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
3041 inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
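
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): v_signmask packs the sign/mask bit of every lane into an integer, one bit per
// lane (the 8-bit vectors need all 64 bits, hence the int64 return type above).
static inline bool example_any_negative(const int* p)
{
    v_int32x16 v = v512_load(p);
    return v_signmask(v) != 0;   // bit i is set exactly when p[i] < 0
}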
3042 
3044 inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3045 inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
3046 inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3047 inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3048 inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3049 inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3050 inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
3051 inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
3052 
3053 inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
3054 inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
3055 inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
3056 inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
3057 inline bool v_check_all(const v_uint8x64& a) { return v_check_all(v_reinterpret_as_s8(a)); }
3058 inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); }
3059 inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
3060 inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
3061 inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); }
3062 inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); }
3063 inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
3064 inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
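
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): v_check_all / v_check_any test the sign (mask) bit of every lane, which is how
// comparison results are usually consumed.
static inline bool example_all_negative(const schar* p)
{
    v_int8x64 v = v512_load(p);
    return v_check_all(v);   // true only if every p[i] < 0
}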
3065 
3066 inline int v_scan_forward(const v_int8x64& a)
3067 {
3068  int64 mask = _mm512_movepi8_mask(a.val);
3069  int mask32 = (int)mask;
3070  return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
3071 }
3072 inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
3073 inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
3074 inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
3075 inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3076 inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3077 inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
3078 inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3079 inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
3080 inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
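
// Usage sketch (illustrative only; assumes the v512_load helper defined earlier in this
// header): v_scan_forward returns the index of the first lane whose sign/mask bit is set,
// e.g. to locate the first match after a comparison.
static inline int example_first_negative(const short* p)
{
    v_int16x32 v = v512_load(p);
    return v_check_any(v) ? v_scan_forward(v) : -1;  // index of the first p[i] < 0, or -1
}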
3081 
3082 inline void v512_cleanup() { _mm256_zeroall(); }
3083 
3084 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3085 
3087 
3088 } // cv::
3089 
3090 #endif // OPENCV_HAL_INTRIN_AVX512_HPP